In [1]:
# Notebook environment setup.
# %matplotlib inline renders matplotlib figures inside the notebook.
# %pprint is a *toggle*: on a fresh kernel it switches pretty-printing OFF, so
# long lists are shown on a single line; re-running this cell flips it back ON.
%matplotlib inline 
%pprint           
Pretty printing has been turned OFF
In [2]:
import plotly
import plotly.graph_objs as go
import numpy as np   # So we can use random numbers in examples

# Must be called before using plotly.offline.iplot() so figures are drawn
# inside the notebook in offline mode (as opposed to Plotly's cloud service).
plotly.offline.init_notebook_mode()
In [3]:
N = 1000

# Seed NumPy's global generator so the same point cloud is produced on every
# "Restart & Run All"; without it the figure changes from run to run.
np.random.seed(42)
random_x = np.random.randn(N)
random_y = np.random.randn(N)

# A single scatter trace drawn as markers only (no connecting lines).
trace = go.Scatter(
    x=random_x,
    y=random_y,
    mode='markers'
)

# iplot accepts a plain list of traces.
data = [trace]

# Plot and embed in the notebook.
plotly.offline.iplot(data, filename='basic-scatter')
In [4]:
# Two series plotted against the same x values.
trace0 = go.Scatter(
    x=[1, 2, 3, 4],
    y=[10, 15, 13, 17]
)
trace1 = go.Scatter(
    x=[1, 2, 3, 4],
    y=[16, 5, 11, 9]
)

# go.Data is a deprecated wrapper; a plain list of traces is the supported
# way to hand several traces to iplot.
data = [trace0, trace1]

plotly.offline.iplot(data, filename='basic-line')

Preparing data

Let's use NLTK's inaugural speech corpus. We will look at how often 'America' and 'citizen' (and their variants) are used over the years.

In [5]:
import nltk
from nltk.corpus import inaugural
# List the corpus file ids; each one is named '<year>-<president>.txt'.
inaugural.fileids()
Out[5]:
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reagan.txt', '1989-Bush.txt', '1993-Clinton.txt', '1997-Clinton.txt', '2001-Bush.txt', '2005-Bush.txt', '2009-Obama.txt']
In [6]:
# Token count of the first ten speeches (drop the [:10] slice to see all).
first_ten = inaugural.fileids()[:10]
for fileid in first_ten:
    n_tokens = len(inaugural.words(fileid))
    print(fileid, n_tokens)
1789-Washington.txt 1538
1793-Washington.txt 147
1797-Adams.txt 2585
1801-Jefferson.txt 1935
1805-Jefferson.txt 2384
1809-Madison.txt 1265
1813-Madison.txt 1304
1817-Monroe.txt 3693
1821-Monroe.txt 4909
1825-Adams.txt 3150
In [7]:
# Token count per speech, keyed by file id.
foo = {}
for fileid in inaugural.fileids():
    foo[fileid] = len(inaugural.words(fileid))

# Ten longest speeches, longest first (drop the [:10] slice to see all).
ranked = sorted(foo.items(), key=lambda item: item[1], reverse=True)
for fileid, n_tokens in ranked[:10]:
    print(fileid, n_tokens)
1841-Harrison.txt 9165
1909-Taft.txt 5846
1845-Polk.txt 5196
1821-Monroe.txt 4909
1889-Harrison.txt 4750
1925-Coolidge.txt 4442
1897-McKinley.txt 4371
1837-VanBuren.txt 4171
1861-Lincoln.txt 4005
1929-Hoover.txt 3890
In [8]:
# Map each inauguration year to a (president, tokens) pair.
# File ids look like '1789-Washington.txt': the first four characters are the
# year, and the name sits between the dash and the '.txt' suffix.
speeches = {}
for fileid in inaugural.fileids():
    year, pres = int(fileid[:4]), fileid[5:-4]
    speeches[year] = (pres, inaugural.words(fileid))

speeches[1989]
Out[8]:
('Bush', ['Mr', '.', 'Chief', 'Justice', ',', 'Mr', '.', ...])
In [9]:
# Flatten every speech into (year, token) pairs -- the input shape that
# nltk.ConditionalFreqDist expects (condition, sample).
year2word = [
    (year, token)
    for year, (_president, tokens) in speeches.items()
    for token in tokens
]
year2word[-10:]
Out[9]:
[(2009, '.'), (2009, 'And'), (2009, 'God'), (2009, 'bless'), (2009, 'the'), (2009, 'United'), (2009, 'States'), (2009, 'of'), (2009, 'America'), (2009, '.')]
In [10]:
# Build a conditional frequency distribution: word counts conditioned on year.
year2word_cfd = nltk.ConditionalFreqDist(year2word)
year2word_cfd[2009]['America']  # number of mentions in the 2009 (Obama) speech
Out[10]:
10
In [11]:
# The distribution's conditions are the inauguration years; put them in
# chronological order.
years = list(year2word_cfd)
years.sort()
years
Out[11]:
[1789, 1793, 1797, 1801, 1805, 1809, 1813, 1817, 1821, 1825, 1829, 1833, 1837, 1841, 1845, 1849, 1853, 1857, 1861, 1865, 1869, 1873, 1877, 1881, 1885, 1889, 1893, 1897, 1901, 1905, 1909, 1913, 1917, 1921, 1925, 1929, 1933, 1937, 1941, 1945, 1949, 1953, 1957, 1961, 1965, 1969, 1973, 1977, 1981, 1985, 1989, 1993, 1997, 2001, 2005, 2009]
In [12]:
# President names aligned one-to-one with `years`.
presidents = [
    president
    for president, _tokens in (speeches[year] for year in years)
]
presidents
Out[12]:
['Washington', 'Washington', 'Adams', 'Jefferson', 'Jefferson', 'Madison', 'Madison', 'Monroe', 'Monroe', 'Adams', 'Jackson', 'Jackson', 'VanBuren', 'Harrison', 'Polk', 'Taylor', 'Pierce', 'Buchanan', 'Lincoln', 'Lincoln', 'Grant', 'Grant', 'Hayes', 'Garfield', 'Cleveland', 'Harrison', 'Cleveland', 'McKinley', 'McKinley', 'Roosevelt', 'Taft', 'Wilson', 'Wilson', 'Harding', 'Coolidge', 'Hoover', 'Roosevelt', 'Roosevelt', 'Roosevelt', 'Roosevelt', 'Truman', 'Eisenhower', 'Eisenhower', 'Kennedy', 'Johnson', 'Nixon', 'Nixon', 'Carter', 'Reagan', 'Reagan', 'Bush', 'Clinton', 'Clinton', 'Bush', 'Bush', 'Obama']
In [13]:
# Per-year mentions of 'America' and its derived forms (case-sensitive),
# in the same order as `years`.
america_count = [
    sum(year2word_cfd[year][form]
        for form in ('America', 'American', 'Americans'))
    for year in years
]
america_count[:20]
Out[13]:
[2, 1, 8, 0, 1, 0, 1, 1, 2, 0, 0, 2, 2, 7, 0, 2, 2, 3, 2, 1]
In [14]:
# Per-year mentions of 'citizen' in singular/plural and both capitalizations,
# in the same order as `years`.
citizen_count = [
    sum(year2word_cfd[year][form]
        for form in ('citizens', 'citizen', 'Citizens', 'Citizen'))
    for year in years
]
citizen_count[:20]
Out[14]:
[5, 1, 6, 7, 10, 1, 4, 14, 15, 3, 2, 3, 7, 38, 11, 2, 3, 7, 7, 0]

Plotting time!

In [15]:
# One series per word family, both plotted against the inauguration years.
# `mode` can be 'markers', 'lines+markers' or 'lines'.
trace0 = go.Scatter(x=years, y=america_count,
                    name="America",
                    mode="lines+markers")

trace1 = go.Scatter(x=years, y=citizen_count,
                    name="Citizen",
                    mode="lines+markers")

# go.Data is a deprecated wrapper; a plain list of traces is the supported form.
mydata = [trace0, trace1]

# Title plus axis labels so the figure can be read on its own.
mylayout = go.Layout(
    title="Frequency of 'America' vs. 'citizen' in inaugural speeches",
    xaxis=dict(title="Inauguration year"),
    yaxis=dict(title="Number of mentions"),
)

fig = go.Figure(data=mydata, layout=mylayout)

plotly.offline.iplot(fig, filename='inaugural-america-vs-citizen')

Table using pandas module

In [16]:
# plotly.tools.FigureFactory is a deprecated alias; the figure factories
# (create_table, ...) live in the plotly.figure_factory module.
import plotly.figure_factory as ff
import pandas as pd
In [17]:
# Load the school_earnings sample CSV from the plotly/datasets GitHub repo
# (requires network access) and render it as a plotly table.
file = (
    "https://raw.githubusercontent.com/plotly/datasets/"
    "master/school_earnings.csv"
)
df = pd.read_csv(file)
table = ff.create_table(df)
plotly.offline.iplot(table, filename='jupyter/table1')
In [18]:
# Confirm what pd.read_csv returned.
type(df)
Out[18]:
<class 'pandas.core.frame.DataFrame'>
In [19]:
# (column name, column values) pairs, in the order the columns should appear.
myitems = [('a', [1, 2, 3, 4]), ('b', [2, 8, 30, 40]), ('c', [14, 3, 7, 20])]

# DataFrame.from_items was removed in pandas 1.0. Build the frame from a dict
# and pass `columns` explicitly so the column order always follows `myitems`.
df = pd.DataFrame(dict(myitems), columns=[name for name, _ in myitems])

table = ff.create_table(df)            # create a table off of the data frame
plotly.offline.iplot(table)

Turning our presidential data into a table

In [20]:
# One (column label, column values) pair per table column; all four lists are
# aligned with `years`.
time = ('Year', years)
pres = ('President', presidents)
amer = ('America', america_count)
citi = ('Citizen', citizen_count)
inaug_items = [time, pres, amer, citi]

# DataFrame.from_items was removed in pandas 1.0. Build the frame from a dict
# and pass `columns` explicitly so the column order always follows `inaug_items`.
df_inaug = pd.DataFrame(dict(inaug_items),
                        columns=[label for label, _ in inaug_items])
table_inaug = ff.create_table(df_inaug)
plotly.offline.iplot(table_inaug, filename='jupyter/table_inaug')
In [21]:
# One bar series per word family, read straight from the data frame columns.
trace_a = go.Bar(x=df_inaug.Year,
                 y=df_inaug.America,
                 name='America',
                 marker=dict(color='#A2D5F2'))

trace_b = go.Bar(x=df_inaug.Year,
                 y=df_inaug.Citizen,
                 name='Citizen',
                 marker=dict(color='#FFCDD2'))

# go.Data is a deprecated wrapper; a plain list of traces is the supported form.
data3 = [trace_a, trace_b]

# Single-series variant, for comparison:
#data3 = [go.Bar(x=df_inaug.Year, y=df_inaug.America)]

plotly.offline.iplot(data3, filename='jupyter/basic_bar')