CSV

1
2
3
4
5
import pandas as pd

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)
1
2
3
4
5
6
7
8
landtemps = pd.read_csv('landtempssample.csv',
        names=['stationid', 'year', 'month', 'avgtemp',
            'latitude', 'longitude', 'elevation', 'station',
            'countryid', 'country'],
            skiprows=1,
            parse_dates=[['month', 'year']],
            low_memory=False)
type(landtemps)
1
pandas.core.frame.DataFrame
  • skiprows=1 -- skips the first row (since these are the headers);
  • parse_dates= converts the month and year columns to a date value.
1
landtemps.shape, landtemps.dtypes
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
((100000, 9),
month_year    datetime64[ns]
stationid             object
avgtemp              float64
latitude             float64
longitude            float64
elevation            float64
station               object
countryid             object
country               object
dtype: object)
1
landtemps.head(7)
month_year stationid avgtemp latitude ... elevation station countryid country
0 2000-04-01 USS0010K01S 5.27 39.90 ... 2,773.70 INDIAN_CANYON US United States
1 1940-05-01 CI000085406 18.04 -18.35 ... 58.00 ARICA CI Chile
2 2013-12-01 USC00036376 6.22 34.37 ... 61.00 SAINT_CHARLES US United States
3 1963-02-01 ASN00024002 22.93 -34.28 ... 65.50 BERRI_IRRIGATION AS Australia
4 2001-11-01 ASN00028007 NaN -14.78 ... 79.40 MUSGRAVE AS Australia
5 1991-04-01 USW00024151 5.59 42.15 ... 1,362.50 MALAD_CITY US United States
6 1993-12-01 RSM00022641 -10.17 63.90 ... 13.00 ONEGA RS Russia

7 rows × 9 columns

1
landtemps.describe()
avgtemp latitude longitude elevation
count 85,554.00 100,000.00 100,000.00 100,000.00
mean 10.92 35.08 -38.12 565.56
std 11.52 23.55 84.30 1,073.65
min -70.70 -90.00 -179.98 -350.00
25% 3.46 33.19 -101.66 64.00
50% 12.22 40.79 -79.57 240.80
75% 19.57 47.42 17.53 644.70
max 39.95 82.52 179.75 9,999.00
1
2
landtemps.rename(columns={'month_year':'measuredate'}, inplace=True)
landtemps.dtypes
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
measuredate    datetime64[ns]
stationid              object
avgtemp               float64
latitude              float64
longitude             float64
elevation             float64
station                object
countryid              object
country                object
dtype: object
1
landtemps.avgtemp.describe()
1
2
3
4
5
6
7
8
9
count   85,554.00
mean        10.92
std         11.52
min        -70.70
25%          3.46
50%         12.22
75%         19.57
max         39.95
Name: avgtemp, dtype: float64

Below, we use isnull() chained with sum() to find and count each column's missing values:

1
landtemps.isnull().sum()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
measuredate        0
stationid          0
avgtemp        14446
latitude           0
longitude          0
elevation          0
station            0
countryid          0
country            5
dtype: int64
  • This removes rows in avgtemp that are NULL:
1
2
landtemps.dropna(subset=['avgtemp'], inplace=True)
landtemps.shape
1
(85554, 9)