SQL

1
2
3
4
5
6
7
8
import pandas as pd
import numpy as np
import pymssql
import mysql.connector

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)
  • Use the pymssql API and read_sql to retrieve and load data from a SQL Server instance. Select the columns we want from the SQL Server data and use SQL aliases to improve column names (for example, fedu AS fathereducation). Create a connection to the SQL Server data by passing database credentials to the pymssql connect function. Create a pandas data frame by passing the select statement and connection object to read_sql. Close the connection to return it to the pool on the server:
1
2
3
4
5
query = "SELECT studentid, school, sex, age, famsize,\
    medu AS mothereducation, fedu AS fathereducation,\
    traveltime, studytime, failures, famrel, freetime,\
    goout, g1 AS gradeperiod1, g2 AS gradeperiod2,\
    g3 AS gradeperiod3 FROM studentmath"
1
2
3
4
5
6
7
8
9
server = 'pdcc.c9sqqzd5fulv.us-west-2.rds.amazonaws.com'
user = 'pdccuser'
password = 'pdccpass'
database = 'pdcctest'

conn = pymssql.connect(server=server, user=user, password=password, database=database)

studentmath = pd.read_sql(query, conn)
conn.close()
1
2
/var/folders/r5/kwzghqz92v73843mkcq0vhp40000gn/T/ipykernel_6242/2728001321.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  studentmath = pd.read_sql(query, conn)
1
studentmath.dtypes
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
studentid          object
school             object
sex                object
age                 int64
famsize            object
mothereducation     int64
fathereducation     int64
traveltime          int64
studytime           int64
failures            int64
famrel              int64
freetime            int64
goout               int64
gradeperiod1        int64
gradeperiod2        int64
gradeperiod3        int64
dtype: object
1
studentmath.head()
studentid school sex age ... goout gradeperiod1 gradeperiod2 gradeperiod3
0 001 GP F 18 ... 4 5 6 6
1 002 GP F 17 ... 3 5 5 6
2 003 GP F 15 ... 2 7 8 10
3 004 GP F 15 ... 2 15 14 15
4 005 GP F 16 ... 2 6 10 10

5 rows × 16 columns

1
2
3
4
5
6
newcolorder = ['studentid', 'gradeperiod1', 'gradeperiod2',
    'gradeperiod3', 'school', 'sex', 'age', 'famsize',
    'mothereducation', 'fathereducation', 'traveltime',
    'studytime', 'freetime', 'failures', 'famrel', 'goout']

studentmath = studentmath[newcolorder]
1
studentmath.studentid.count()
1
395
1
studentmath.studentid.nunique()
1
395
1
studentmath.set_index('studentid', inplace=True)
1
studentmath.count()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
gradeperiod1       395
gradeperiod2       395
gradeperiod3       395
school             395
sex                395
age                395
famsize            395
mothereducation    395
fathereducation    395
traveltime         395
studytime          395
freetime           395
failures           395
famrel             395
goout              395
dtype: int64
  • Replace coded data values with more informative values:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
setvalues = {'famrel':{1: '1:very bad', 2:'2:bad', 3:'3:neutral',
    4:'4:good', 5:'5:excellent'},
    'freetime':{1: '1:very low', 2: '2:low', 3: '3:neutral',
    4: '4:high', 5: '5:very high'},
    'goout':{1: '1:very low', 2: '2:low', 3: '3:neutral',
    4: '4:high', 5: '5:very high'},
    'mothereducation':{0:np.nan, 1: '1:k-4', 2: '2:5-9',
    3: '3:secondary ed', 4: '4:higher ed'},
    'fathereducation':{0:np.nan, 1: '1:k-4', 2: '2:5-9',
    3: '3:secondary ed', 4: '4:higher ed'}}

studentmath.replace(setvalues, inplace=True)
1
setvalueskeys = [k for k in setvalues]
  • Check for changes in memory usage:
  • Change the type for columns with the changed data to category:
1
studentmath[setvalueskeys].memory_usage(index=False)
1
2
3
4
5
6
famrel             3160
freetime           3160
goout              3160
mothereducation    3160
fathereducation    3160
dtype: int64
1
2
for col in studentmath[setvalueskeys].columns:
    studentmath[col] = studentmath[col].astype('category')
1
studentmath[setvalueskeys].memory_usage(index=False)
1
2
3
4
5
6
famrel             607
freetime           607
goout              607
mothereducation    599
fathereducation    599
dtype: int64
1
studentmath['famrel'].value_counts(sort=False, normalize=True)
1
2
3
4
5
6
1:very bad    0.02
2:bad         0.05
3:neutral     0.17
4:good        0.49
5:excellent   0.27
Name: famrel, dtype: float64
1
2
studentmath[['freetime', 'goout']].\
    apply(pd.Series.value_counts, sort=False, normalize=True)
freetime goout
1:very low 0.05 0.06
2:low 0.16 0.26
3:neutral 0.40 0.33
4:high 0.29 0.22
5:very high 0.10 0.13
1
2
studentmath[['mothereducation', 'fathereducation']].\
    apply(pd.Series.value_counts, sort=False, normalize=True)
mothereducation fathereducation
1:k-4 0.15 0.21
2:5-9 0.26 0.29
3:secondary ed 0.25 0.25
4:higher ed 0.33 0.24
1