JSON

1
2
3
4
5
import pandas as pd
import numpy as np
import json
import pprint
from collections import Counter
1
2
# Load the whole news sample into memory. JSON is normally UTF-8 and the
# titles contain non-ASCII punctuation, so decode explicitly instead of
# relying on the platform's default text encoding.
with open('allcandidatenewssample.json', encoding='utf-8') as f:
    candidatenews = json.load(f)
1
# Number of top-level records parsed from the JSON file.
len(candidatenews)
1
60000
1
# Pretty-print the first two records to see what a typical entry holds.
pprint.pprint(candidatenews[:2])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
[{'date': '2019-12-25 10:00:00',
  'domain': 'www.nbcnews.com',
  'panel_position': 1,
  'query': 'Michael Bloomberg',
  'source': 'NBC News',
  'story_position': 6,
  'time': '18 hours ago',
  'title': 'Bloomberg cuts ties with company using prison inmates to make '
           'campaign calls',
  'url': 'https://www.nbcnews.com/politics/2020-election/bloomberg-cuts-ties-company-using-prison-inmates-make-campaign-calls-n1106971'},
 {'date': '2019-11-09 08:00:00',
  'domain': 'www.townandcountrymag.com',
  'panel_position': 1,
  'query': 'Amy Klobuchar',
  'source': 'Town & Country Magazine',
  'story_position': 3,
  'time': '18 hours ago',
  'title': "Democratic Candidates React to Michael Bloomberg's Potential Run",
  'url': 'https://www.townandcountrymag.com/society/politics/a29739854/michael-bloomberg-democratic-candidates-campaign-reactions/'}]
1
# Tally how many keys each record has.
Counter(map(len, candidatenews))
1
Counter({9: 57202, 2: 2382, 10: 416})
1
# Show the first record that carries more than nine keys.
oversized = (record for record in candidatenews if len(record) > 9)
pprint.pprint(next(oversized))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
{'category': 'Satire',
 'date': '2019-08-21 04:00:00',
 'domain': 'politics.theonion.com',
 'panel_position': 1,
 'query': 'John Hickenlooper',
 'source': 'Politics | The Onion',
 'story_position': 8,
 'time': '4 days ago',
 'title': '‘And Then There Were 23,’ Says Wayne Messam Crossing Out '
          'Hickenlooper Photo \n'
          'In Elaborate Grid Of Rivals',
 'url': 'https://politics.theonion.com/and-then-there-were-23-says-wayne-messam-crossing-ou-1837311060'}
1
# Show the first record that carries fewer than nine keys.
undersized = (record for record in candidatenews if len(record) < 9)
pprint.pprint(next(undersized))
1
{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'}
1
# Print up to ten of the records that have exactly two keys.
two_key_records = [record for record in candidatenews if len(record) == 2]
pprint.pprint(two_key_records[:10])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
[{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-07-24 00:00:00', 'reason': 'No Top stories'},
 {'date': '2019-08-19 20:00:00', 'reason': 'Not collected'},
 {'date': '2019-09-13 16:00:00', 'reason': 'Not collected'},
 {'date': '2019-10-16 20:00:00', 'reason': 'No Top stories'},
 {'date': '2019-10-17 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-08-02 14:00:00', 'reason': 'Not collected'},
 {'date': '2019-05-27 12:00:00', 'reason': 'Not collected'},
 {'date': '2019-12-03 12:00:00', 'reason': 'No Top stories'},
 {'date': '2019-01-03 00:00:00', 'reason': 'No Top stories'}]
1
# Keep only the records that have more than two keys.
candidatenews = [record for record in candidatenews if len(record) > 2]
1
# Number of records currently held in candidatenews.
len(candidatenews)
1
57618
1
2
# Collect the stories whose source is exactly 'Politico', then count them.
politico = [story for story in candidatenews if story['source'] == 'Politico']
len(politico)
1
2732
1
# Pretty-print the first two Politico stories.
pprint.pprint(politico[:2])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
[{'date': '2019-05-18 18:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Marianne Williamson',
  'source': 'Politico',
  'story_position': 7,
  'time': '1 week ago',
  'title': 'Marianne Williamson reaches donor threshold for Dem debates',
  'url': 'https://www.politico.com/story/2019/05/09/marianne-williamson-2020-election-1315133'},
 {'date': '2018-12-27 06:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Julian Castro',
  'source': 'Politico',
  'story_position': 1,
  'time': '1 hour ago',
  'title': "O'Rourke and Castro on collision course in Texas",
  'url': 'https://www.politico.com/story/2018/12/27/orourke-julian-castro-collision-texas-election-1073720'}]
1
2
# Pull the 'source' value out of every record (None when the key is absent),
# then display the container type and its length.
sources = [record.get('source') for record in candidatenews]
type(sources), len(sources)
1
(list, 57618)
1
# Peek at the first five source names.
sources[:5]
1
['NBC News', 'Town & Country Magazine', 'TheHill', 'CNBC.com', 'Fox News']
1
# Ten most frequent sources, most common first.
pprint.pprint(Counter(sources).most_common(10))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
[('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('TheHill', 2383),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('The Hill', 1342),
 ('New York Post', 1275),
 ('Vox', 941)]
1
2
3
4
5
6
# Merge the two spellings of the same outlet by relabelling 'TheHill' as
# 'The Hill'. Records are edited in place; checking the one key directly
# avoids scanning every key/value pair of each record.
for newsdict in candidatenews:
    if newsdict.get('source') == 'TheHill':
        newsdict['source'] = 'The Hill'

# Recount now that both spellings share one label.
sources = [item.get('source') for item in candidatenews]
pprint.pprint(Counter(sources).most_common(10))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
[('The Hill', 3725),
 ('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('New York Post', 1275),
 ('Vox', 941),
 ('Breitbart', 799)]
1
# Build a DataFrame from the list of record dicts.
candidatenewsdf = pd.DataFrame(data=candidatenews)
1
# Inspect the dtype pandas assigned to each column.
candidatenewsdf.dtypes
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
title             object
url               object
source            object
time              object
date              object
query             object
story_position     int64
panel_position    object
domain            object
category          object
dtype: object
1
# Rename the 'date' column to 'storydate', modifying the frame in place.
candidatenewsdf.rename(
    columns={'date': 'storydate'},
    inplace=True,
)
1
# Convert the 'storydate' column to datetime64[ns].
candidatenewsdf['storydate'] = candidatenewsdf['storydate'].astype('datetime64[ns]')
1
# (rows, columns) of the DataFrame.
candidatenewsdf.shape
1
(57618, 10)
1
# Ten most frequent sources; value_counts sorts by descending count by default.
candidatenewsdf['source'].value_counts().head(10)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
The Hill               3725
Fox News               3530
CNN.com                2750
Politico               2732
The New York Times     1804
Washington Post        1770
Washington Examiner    1655
New York Post          1275
Vox                     941
Breitbart               799
Name: source, dtype: int64
1