In [4]:
from __future__ import division

Load text data

In [6]:
# Read the whole file into memory as one string. The `with` block closes the
# file handle as soon as the read finishes instead of leaving it open until
# the interpreter happens to collect it.
with open("text.txt") as f:
    text = f.read()
In [10]:
# Slicing: positions 0 through 9 of the text (the end index is exclusive,
# and an omitted start index means "from the beginning").
text[:10]
Out[10]:
'Clinton La'
In [11]:
# Return a lower-cased copy of the text; `text` itself is not modified.
text.lower()
Out[11]:
'clinton lays into trump for his praise of russia\xe2\x80\x99s leader\nby amy chozick and jonathan martin 12:28 pm et\nhillary clinton seized on donald j. trump\xe2\x80\x99s assertion at a forum on wednesday night that vladimir v. putin was a better leader than president obama.\nshe said that mr. trump\xe2\x80\x99s comments showed that, if elected, he would be little more than a tool of mr. putin.\naccused of insufficient smiling, clinton says she will stay serious on issues 11:38 am et\n'
In [12]:
# With no argument, split() breaks the text at runs of whitespace (spaces and
# newlines alike). Punctuation stays attached to the neighbouring word.
text.split()
Out[12]:
['Clinton',
 'Lays',
 'Into',
 'Trump',
 'for',
 'His',
 'Praise',
 'of',
 'Russia\xe2\x80\x99s',
 'Leader',
 'By',
 'AMY',
 'CHOZICK',
 'and',
 'JONATHAN',
 'MARTIN',
 '12:28',
 'PM',
 'ET',
 'Hillary',
 'Clinton',
 'seized',
 'on',
 'Donald',
 'J.',
 'Trump\xe2\x80\x99s',
 'assertion',
 'at',
 'a',
 'forum',
 'on',
 'Wednesday',
 'night',
 'that',
 'Vladimir',
 'V.',
 'Putin',
 'was',
 'a',
 'better',
 'leader',
 'than',
 'President',
 'Obama.',
 'She',
 'said',
 'that',
 'Mr.',
 'Trump\xe2\x80\x99s',
 'comments',
 'showed',
 'that,',
 'if',
 'elected,',
 'he',
 'would',
 'be',
 'little',
 'more',
 'than',
 'a',
 'tool',
 'of',
 'Mr.',
 'Putin.',
 'Accused',
 'of',
 'Insufficient',
 'Smiling,',
 'Clinton',
 'Says',
 'She',
 'Will',
 'Stay',
 'Serious',
 'on',
 'Issues',
 '11:38',
 'AM',
 'ET']

Basic processing: lists

In [13]:
# Keep the whitespace-separated tokens in a list so they can be indexed.
tokens = text.split()
In [16]:
# The first five tokens (list slicing works the same way as string slicing).
tokens[:5]
Out[16]:
['Clinton', 'Lays', 'Into', 'Trump', 'for']
In [17]:
# A negative index counts from the end of the list: -1 is the last token.
tokens[-1]
Out[17]:
'ET'

Basic counting: dictionaries

In [18]:
# Start from an empty dictionary, then bind the key 'asdf' to the value 5.
d = dict()
d['asdf'] = 5
In [19]:
# Display the dictionary's current contents.
d
Out[19]:
{'asdf': 5}
In [20]:
# Values need not share a type: this entry stores a float.
d['qwerqwereqw'] = 10.45
In [21]:
# The dictionary now holds two entries, one int-valued and one float-valued.
d
Out[21]:
{'asdf': 5, 'qwerqwereqw': 10.45}
In [22]:
# A tuple literal: an immutable sequence, which makes it usable as a
# dictionary key.
('asdf','qwer','zxcv')
Out[22]:
('asdf', 'qwer', 'zxcv')
In [24]:
# A tuple can serve as a dictionary key.
d[('a', 'b')] = 1000
In [25]:
# String keys and a tuple key coexist in the same dictionary.
d
Out[25]:
{'asdf': 5, 'qwerqwereqw': 10.45, ('a', 'b'): 1000}
In [27]:
# Assigning to a key that already exists replaces its value.
d[('a', 'b')] = 1200
In [28]:
# Still three entries: the ('a', 'b') value was overwritten, not duplicated.
d
Out[28]:
{'asdf': 5, 'qwerqwereqw': 10.45, ('a', 'b'): 1200}
In [29]:
# Evaluating None produces no Out[] entry for this cell.
None

Counting with dictionaries

In [31]:
# Load a second file and rebind `text` and `tokens` to its contents, so the
# cells that follow operate on this text. The `with` block closes the file
# handle as soon as the read finishes.
with open("new.txt") as f:
    text = f.read()
tokens = text.split()

# Number of whitespace-separated tokens in the new text.
len(tokens)
Out[31]:
565
In [32]:
# collections.Counter is a ready-made alternative for counting; the import is
# disabled here (presumably so the counting is built by hand with a plain dict).
#from collections import Counter
In [36]:
# Tally how often each token occurs, using a plain dictionary: look up the
# running count (0 for a token not seen yet) and store it back plus one.
counts = {}
for w in tokens:
    counts[w] = counts.get(w, 0) + 1
In [38]:
# The same tally, written with the in-place `+=` operator: make sure the
# token has an entry (starting at 0), then increment it.
counts = {}
for w in tokens:
    counts.setdefault(w, 0)
    counts[w] += 1

Plotting

In [39]:
import matplotlib.pyplot as plt
%matplotlib inline
In [41]:
# Line plot: the first list holds the x values, the second the y values.
plt.plot([1, 2, 4], [100, 150, 800])
Out[41]:
[<matplotlib.lines.Line2D at 0x106b08510>]
In [42]:
# Scatter plot of the same three (x, y) points, drawn as unconnected markers.
plt.scatter([1, 2, 4], [100, 150, 800])
Out[42]:
<matplotlib.collections.PathCollection at 0x106aad850>
In [44]:
# `re` is not imported in any cell visible in this notebook; without this
# import the cell raises NameError when run on a fresh kernel.
import re

# Split the text at every single space or comma. Each delimiter character
# splits on its own, so a comma followed by a space leaves an empty string in
# the result. Newlines are not delimiters, so the words on either side of a
# line break stay joined in one token.
re.split('[ ,]', text)
Out[44]:
['CHARLOTTE',
 '',
 'N.C.',
 '\xe2\x80\x94',
 'Hillary',
 'Clinton',
 'excoriated',
 'Donald',
 'J.',
 'Trump',
 'on',
 'Thursday',
 'for',
 'asserting',
 'that',
 'the',
 'Russian',
 'president',
 '',
 'Vladimir',
 'V.',
 'Putin',
 '',
 'was',
 'a',
 'better',
 'leader',
 'than',
 'President',
 'Obama',
 '',
 'saying',
 'Mr.',
 'Trump\xe2\x80\x99s',
 'praise',
 'for',
 'the',
 'authoritarian',
 'leader',
 'of',
 'an',
 'adversarial',
 'power',
 '\xe2\x80\x9cis',
 'not',
 'just',
 'unpatriotic',
 'and',
 'insulting',
 'to',
 'the',
 'people',
 'of',
 'our',
 'country',
 '',
 'as',
 'well',
 'as',
 'to',
 'our',
 'commander',
 'in',
 'chief',
 '',
 'it',
 'is',
 'scary.\xe2\x80\x9d\n\nMrs.',
 'Clinton',
 '',
 'the',
 'Democratic',
 'presidential',
 'candidate',
 '',
 'seized',
 'on',
 'Mr.',
 'Trump\xe2\x80\x99s',
 'assertion',
 'in',
 'a',
 'televised',
 'forum',
 'Wednesday',
 'that',
 'Mr.',
 'Putin\xe2\x80\x99s',
 'incursions',
 'into',
 'neighboring',
 'countries',
 '',
 'crackdown',
 'on',
 'Russia\xe2\x80\x99s',
 'independent',
 'press',
 'and',
 'support',
 'for',
 'America\xe2\x80\x99s',
 'enemies',
 'were',
 'no',
 'less',
 'troublesome',
 'than',
 'Mr.',
 'Obama\xe2\x80\x99s',
 'transgressions.',
 'She',
 'said',
 'it',
 'showed',
 'that',
 'if',
 'elected',
 '',
 'her',
 'Republican',
 'rival',
 'would',
 'be',
 'little',
 'more',
 'than',
 'a',
 'tool',
 'of',
 'Mr.',
 'Putin.\n\n\xe2\x80\x9cIt',
 'suggests',
 'he',
 'will',
 'let',
 'Putin',
 'do',
 'whatever',
 'Putin',
 'wants',
 'to',
 'do',
 '',
 'and',
 'then',
 'make',
 'excuses',
 'for',
 'him',
 '\xe2\x80\x9d',
 'Mrs.',
 'Clinton',
 'told',
 'reporters',
 'in',
 'New',
 'York',
 'on',
 'Thursday',
 'morning',
 'at',
 'the',
 'White',
 'Plains',
 'airport',
 '',
 'ratcheting',
 'up',
 'her',
 'oratory',
 'as',
 'polls',
 'indicate',
 'the',
 'race',
 'has',
 'tightened',
 '\xe2\x80\x94',
 'and',
 'as',
 'Mr.',
 'Trump',
 'continues',
 'to',
 'say',
 'things',
 'rarely',
 'heard',
 'before',
 'from',
 'a',
 'major',
 'party',
 'presidential',
 'nominee.\n\nIn',
 'the',
 'Wednesday',
 'night',
 'forum',
 'on',
 'NBC',
 'and',
 'MSNBC',
 '',
 'which',
 'was',
 'devoted',
 'to',
 'national',
 'security',
 'issues',
 '',
 'Mr.',
 'Trump',
 'twice',
 'denigrated',
 'America\xe2\x80\x99s',
 'generals',
 '',
 'suggested',
 'he',
 'would',
 'fire',
 'the',
 'country\xe2\x80\x99s',
 'current',
 'military',
 'leadership',
 'and',
 'claimed',
 '',
 'without',
 'offering',
 'evidence',
 '',
 'that',
 'the',
 'intelligence',
 'officials',
 'who',
 'recently',
 'gave',
 'him',
 'a',
 'classified',
 'briefing',
 'about',
 'threats',
 'to',
 'the',
 'United',
 'States',
 'were',
 'not',
 'pleased',
 'with',
 'Mr.',
 'Obama.\n\nIn',
 'a',
 'news',
 'conference',
 'before',
 'boarding',
 'her',
 'campaign',
 'plane',
 '',
 'Mrs.',
 'Clinton',
 'appeared',
 'incredulous',
 'at',
 'times',
 'as',
 'she',
 'remarked',
 'upon',
 'Mr.',
 'Trump\xe2\x80\x99s',
 'statements',
 'the',
 'night',
 'before',
 '',
 'particularly',
 'about',
 'Mr.',
 'Putin.\n\nIn',
 'the',
 'forum',
 '',
 'Mr.',
 'Trump',
 'said',
 'of',
 'Mr.',
 'Putin',
 'that',
 'he',
 'had',
 'been',
 'a',
 'leader',
 '\xe2\x80\x9cfar',
 'more',
 'than',
 'our',
 'president.\xe2\x80\x9d\n\nAnd',
 'after',
 'Matt',
 'Lauer',
 '',
 'the',
 'NBC',
 'anchor',
 'who',
 'moderated',
 'the',
 'event',
 '',
 'highlighted',
 'Mr.',
 'Putin\xe2\x80\x99s',
 'record',
 '',
 'Mr.',
 'Trump',
 'shot',
 'back',
 '',
 '\xe2\x80\x9cBut',
 'do',
 'you',
 'want',
 'me',
 'to',
 'start',
 'naming',
 'some',
 'of',
 'the',
 'things',
 'that',
 'President',
 'Obama',
 'does',
 'at',
 'the',
 'same',
 'time?\xe2\x80\x9d\n\nContinue',
 'reading',
 'the',
 'main',
 'story\n\nPresidential',
 'Election',
 '2016\nThe',
 'latest',
 'news',
 'and',
 'analysis',
 'of',
 'the',
 'candidates',
 'and',
 'issues',
 'shaping',
 'the',
 'presidential',
 'race.\nWho',
 'Is',
 'Winning',
 'in',
 'West',
 'Virginia?\nSEP',
 '8\nDonald',
 'Trump',
 'vs.',
 'Hillary',
 'Clinton:',
 'Where',
 'They',
 'Stand',
 'on',
 'Education\nSEP',
 '8\nNot',
 'Smiling',
 'Enough?',
 'Hillary',
 'Clinton',
 'Says',
 'She',
 'Will',
 'Stay',
 'Serious',
 'on',
 'Vital',
 'Issues\nSEP',
 '8\nWho',
 'Is',
 'Winning',
 'in',
 'Alabama?\nSEP',
 '8\nMatt',
 'Lauer',
 'Loses',
 'the',
 'War',
 'in',
 'a',
 'Battle',
 'Between',
 'the',
 'Candidates\nSEP',
 '8\nSee',
 'More',
 '\xc2\xbb\n\nAdvertisement\n\nContinue',
 'reading',
 'the',
 'main',
 'story\nIn',
 'her',
 'news',
 'conference',
 'Thursday',
 '',
 'Mrs.',
 'Clinton',
 'asked',
 '',
 '\xe2\x80\x9cWhat',
 'would',
 'Ronald',
 'Reagan',
 'say',
 'about',
 'a',
 'Republican',
 'nominee',
 'who',
 'attacks',
 'American',
 'generals',
 'and',
 'heaps',
 'praise',
 'on',
 'Russia\xe2\x80\x99s',
 'president?\xe2\x80\x9d\n\nMrs.',
 'Clinton',
 'was',
 'also',
 'withering',
 'in',
 'referring',
 'to',
 'Mr.',
 'Trump\xe2\x80\x99s',
 'assertion',
 'that',
 'the',
 'United',
 'States',
 'made',
 'a',
 'mistake',
 'by',
 'not',
 'seizing',
 'oil',
 'fields',
 'in',
 'Iraq',
 'and',
 'Libya',
 'after',
 'invading',
 'the',
 'countries.\n\n\xe2\x80\x9cThe',
 'United',
 'States',
 'of',
 'America',
 'does',
 'not',
 'invade',
 'other',
 'countries',
 'to',
 'plunder',
 'and',
 'pillage',
 '\xe2\x80\x9d',
 'she',
 'said.',
 '\xe2\x80\x9cWe',
 'don\xe2\x80\x99t',
 'send',
 'our',
 'brave',
 'men',
 'and',
 'women',
 'around',
 'the',
 'world',
 'to',
 'steal',
 'oil.',
 'And',
 'that\xe2\x80\x99s',
 'not',
 'even',
 'getting',
 'into',
 'the',
 'absurdity',
 'of',
 'what',
 'is',
 'involved.\xe2\x80\x9d\n\nMrs.',
 'Clinton',
 '',
 'who',
 'had',
 'faced',
 'a',
 'barrage',
 'of',
 'questions',
 'in',
 'her',
 'half',
 'of',
 'the',
 'forum',
 'about',
 'her',
 'use',
 'of',
 'a',
 'private',
 'email',
 'server',
 'as',
 'secretary',
 'of',
 'state',
 '',
 'also',
 'used',
 'the',
 'news',
 'conference',
 'to',
 'try',
 'to',
 'drive',
 'a',
 'wedge',
 'between',
 'Mr.',
 'Trump',
 'and',
 'the',
 'leadership',
 'of',
 'his',
 'party.\n\n\n']