CSV
| import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)
|
| landtemps = pd.read_csv('landtempssample.csv',
names=['stationid', 'year', 'month', 'avgtemp',
'latitude', 'longitude', 'elevation', 'station',
'countryid', 'country'],
skiprows=1,
parse_dates=[['month', 'year']],
low_memory=False)
type(landtemps)
|
| pandas.core.frame.DataFrame
|
skiprows=1 -- skips the first row (since these are the headers);
parse_dates= converts the month and year columns to a date value.
| landtemps.shape, landtemps.dtypes
|
| ((100000, 9),
month_year datetime64[ns]
stationid object
avgtemp float64
latitude float64
longitude float64
elevation float64
station object
countryid object
country object
dtype: object)
|
|
month_year |
stationid |
avgtemp |
latitude |
... |
elevation |
station |
countryid |
country |
| 0 |
2000-04-01 |
USS0010K01S |
5.27 |
39.90 |
... |
2,773.70 |
INDIAN_CANYON |
US |
United States |
| 1 |
1940-05-01 |
CI000085406 |
18.04 |
-18.35 |
... |
58.00 |
ARICA |
CI |
Chile |
| 2 |
2013-12-01 |
USC00036376 |
6.22 |
34.37 |
... |
61.00 |
SAINT_CHARLES |
US |
United States |
| 3 |
1963-02-01 |
ASN00024002 |
22.93 |
-34.28 |
... |
65.50 |
BERRI_IRRIGATION |
AS |
Australia |
| 4 |
2001-11-01 |
ASN00028007 |
NaN |
-14.78 |
... |
79.40 |
MUSGRAVE |
AS |
Australia |
| 5 |
1991-04-01 |
USW00024151 |
5.59 |
42.15 |
... |
1,362.50 |
MALAD_CITY |
US |
United States |
| 6 |
1993-12-01 |
RSM00022641 |
-10.17 |
63.90 |
... |
13.00 |
ONEGA |
RS |
Russia |
7 rows × 9 columns
|
avgtemp |
latitude |
longitude |
elevation |
| count |
85,554.00 |
100,000.00 |
100,000.00 |
100,000.00 |
| mean |
10.92 |
35.08 |
-38.12 |
565.56 |
| std |
11.52 |
23.55 |
84.30 |
1,073.65 |
| min |
-70.70 |
-90.00 |
-179.98 |
-350.00 |
| 25% |
3.46 |
33.19 |
-101.66 |
64.00 |
| 50% |
12.22 |
40.79 |
-79.57 |
240.80 |
| 75% |
19.57 |
47.42 |
17.53 |
644.70 |
| max |
39.95 |
82.52 |
179.75 |
9,999.00 |
| landtemps.rename(columns={'month_year':'measuredate'}, inplace=True)
landtemps.dtypes
|
| measuredate datetime64[ns]
stationid object
avgtemp float64
latitude float64
longitude float64
elevation float64
station object
countryid object
country object
dtype: object
|
| landtemps.avgtemp.describe()
|
| count 85,554.00
mean 10.92
std 11.52
min -70.70
25% 3.46
50% 12.22
75% 19.57
max 39.95
Name: avgtemp, dtype: float64
|
Below, we use isnull() chained with sum() to find and count each column's missing values:
| measuredate 0
stationid 0
avgtemp 14446
latitude 0
longitude 0
elevation 0
station 0
countryid 0
country 5
dtype: int64
|
- This removes rows in
avgtemp that are NULL:
| landtemps.dropna(subset=['avgtemp'], inplace=True)
landtemps.shape
|