Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license()" for more information.
= RESTART: C:\Users\Jane Eyre\Documents\ling1330\HW2_bible_austen_bigrams.KEY.py
### Ran my HW2 script to create the Bible and Austen data objects.
### Script output clipped.
>>> dir()
['__annotations__', '__builtins__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'a_bigramcfd', 'a_bigramfd', 'a_bigrams', 'a_etxt', 'a_ptxt', 'a_stxt', 'a_tokfd', 'a_toks', 'a_txt', 'b_bigramcfd', 'b_bigramfd', 'b_bigrams', 'b_sograms', 'b_tokfd', 'b_toks', 'b_txt', 'c', 'count', 'f', 'gram', 'nltk', 'observ_e1', 'observ_e2', 'observ_e3', 'observ_e4', 'observ_e5', 'observ_e6', 'pickle', 'w', 'w2']
>>> sent = 'she was not afraid .'.split()
>>> sent
['she', 'was', 'not', 'afraid', '.']
>>> b_tokfd['she']
982
>>> b_tokfd.freq('she')
0.001037164716965987
>>> b_tokfd.freq('was')
0.004776027342281256
>>> b_tokfd.freq('not')
0.007160872485773311
>>> b_tokfd.freq('afraid')
0.00020384194539148216
>>> b_tokfd.freq('.')
0.02767392048263013
>>> sent
['she', 'was', 'not', 'afraid', '.']
>>> [w for w in sent]
['she', 'was', 'not', 'afraid', '.']
>>> [b_tokfd.freq(w) for w in sent]
[0.001037164716965987, 0.004776027342281256, 0.007160872485773311, 0.00020384194539148216, 0.02767392048263013]
### Using numpy.prod to multiply a list of probabilities into a single product.
### Unigram-based probability estimation for "she was not afraid ."
>>> import numpy
>>> numpy.prod([1,2,3,4,5])
120
>>> [b_tokfd.freq(w) for w in sent]
[0.001037164716965987, 0.004776027342281256, 0.007160872485773311, 0.00020384194539148216, 0.02767392048263013]
>>> probs = [b_tokfd.freq(w) for w in sent]
>>> numpy.prod(probs)
2.0009891005865551e-13
>>> probs = [a_tokfd.freq(w) for w in sent]
>>> probs
[0.011819179315160331, 0.012976739762317347, 0.010656979347173023, 0.00023893532275986537, 0.03130980632320294]
>>> numpy.prod(probs)
1.2227784612297631e-11
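### The unigram estimate above can be wrapped in a small helper. This is
### a sketch only (not run in this session); the name unigram_prob is
### hypothetical, and it assumes an NLTK FreqDist such as b_tokfd or
### a_tokfd. For long sentences, summing log probabilities instead would
### avoid floating-point underflow.
>>> def unigram_prob(sent, tokfd):
...     # multiply together each token's relative frequency in the corpus
...     return numpy.prod([tokfd.freq(w) for w in sent])
... 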
### Bigram-based probability estimation for "she was not afraid ."
>>> sent
['she', 'was', 'not', 'afraid', '.']
>>> a_bigramcfd['she']['was']
701
>>> a_bigramcfd['she'].freq('was')
0.13758586849852797
>>> [a_tokfd.freq('she'), a_bigramcfd['she'].freq('was'), a_bigramcfd['was'].freq('not'), a_bigramcfd['not'].freq('afraid'), a_bigramcfd['afraid'].freq('.')]
[0.011819124480086108, 0.13758586849852797, 0.0650697175545227, 0.00108837614279495, 0.02912621359223301]
>>> probs = [a_tokfd.freq('she'), a_bigramcfd['she'].freq('was'), a_bigramcfd['was'].freq('not'), a_bigramcfd['not'].freq('afraid'), a_bigramcfd['afraid'].freq('.')]
>>> numpy.prod(probs)
3.3542938152700833e-09
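### The chain of lookups above generalizes to any sentence. A sketch
### (not run in this session): the name bigram_prob is hypothetical,
### and it assumes an NLTK FreqDist for the first word (like a_tokfd)
### plus a ConditionalFreqDist for the transitions (like a_bigramcfd).
>>> def bigram_prob(sent, tokfd, bigramcfd):
...     # P(w1) * P(w2|w1) * ... * P(wn|wn-1)
...     p = tokfd.freq(sent[0])
...     for (w1, w2) in zip(sent, sent[1:]):
...         p *= bigramcfd[w1].freq(w2)
...     return p
... 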
### Processing count_1w.txt from Norvig.
### Confirming the downloaded file's location first, so I know how to refer to it.
### Then reading in the content as a list of lines.
>>> import os
>>> os.getcwd()
'C:\\Users\\Jane Eyre\\Documents\\ling1330'
>>> os.listdir()
['austen_bigramcfd.pkl', 'bible_bigramcfd.pkl', 'count_1w.txt', 'enable1.txt', 'Ex4_emma_enable.py', 'gettysburg_address.txt', 'gift-of-magi.txt', 'gift_shell_explore.pdf', 'gutenberg', 'hello.py', 'HW2_bible_austen_bigrams.KEY.py', 'nltk_practice.txt', 'process_gift.py', 'process_gift_out.txt', 'words.pkl']
>>> f = open('count_1w.txt')
>>> lines = f.readlines()
>>> f.close()
>>> lines[0]
'the\t23135851162\n'
>>> lines[1]
'of\t13151942776\n'
>>> lines[2]
'and\t12997637966\n'
>>> len(lines)
333333
>>> lines[-1]
'golgw\t12711\n'
### Experimenting first with a "mini" version.
>>> mini = lines[:5]
>>> mini
['the\t23135851162\n', 'of\t13151942776\n', 'and\t12997637966\n', 'to\t12136980858\n', 'a\t9081174698\n']
>>> for line in mini:
...     print(line)
... 
the	23135851162

of	13151942776

and	12997637966

to	12136980858

a	9081174698

>>> for line in mini:
...     print(line.split())
... 
['the', '23135851162']
['of', '13151942776']
['and', '12997637966']
['to', '12136980858']
['a', '9081174698']
>>> for line in mini:
...     print(tuple(line.split()))
... 
('the', '23135851162')
('of', '13151942776')
('and', '12997637966')
('to', '12136980858')
('a', '9081174698')
>>> mini_rank = []
>>> for line in mini:
...     tu = tuple(line.split())
...     mini_rank.append(tu)
... 
>>> mini_rank
[('the', '23135851162'), ('of', '13151942776'), ('and', '12997637966'), ('to', '12136980858'), ('a', '9081174698')]
### The counts should be integers rather than strings: convert with int() while building.
>>> mini_rank = []
>>> for line in mini:
...     (word, count) = line.split()
...     mini_rank.append((word, int(count)))
... 
>>> mini_rank
[('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)]
### Now ready to build the full ranked list.
>>> goog1w_rank = []
>>> for line in lines:
...     (word, count) = line.split()
...     goog1w_rank.append((word, int(count)))
... 
>>> goog1w_rank[:20]
[('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698), ('in', 8469404971), ('for', 5933321709), ('is', 4705743816), ('on', 3750423199), ('that', 3400031103), ('by', 3350048871), ('this', 3228469771), ('with', 3183110675), ('i', 3086225277), ('you', 2996181025), ('it', 2813163874), ('not', 2633487141), ('or', 2590739907), ('be', 2398724162), ('are', 2393614870)]
>>> goog1w_rank[-20:]
[('goooglo', 12711), ('gooogla', 12711), ('gooogd', 12711), ('gooofa', 12711), ('goooao', 12711), ('goollo', 12711), ('goolld', 12711), ('goolh', 12711), ('goolgee', 12711), ('googook', 12711), ('googllr', 12711), ('googlal', 12711), ('googgoo', 12711), ('googgol', 12711), ('goofel', 12711), ('gooek', 12711), ('gooddg', 12711), ('gooblle', 12711), ('gollgo', 12711), ('golgw', 12711)]
>>> len(goog1w_rank)
333333
### The ordered list of (word, count) pairs is complete.
### Now to build a FreqDist version,
### first experimenting with the mini version of goog1w_rank:
>>> mini_fd = nltk.FreqDist()    # initialize an empty FreqDist object
>>> mini_rank = goog1w_rank[:5]
>>> mini_rank
[('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)]
>>> for (word, count) in mini_rank:
...     print(word, count)
... 
the 23135851162
of 13151942776
and 12997637966
to 12136980858
a 9081174698
>>> for (word, count) in mini_rank:
...     mini_fd[word] = count
... 
>>> mini_fd
FreqDist({'the': 23135851162, 'of': 13151942776, 'and': 12997637966, 'to': 12136980858, 'a': 9081174698})
>>> mini_fd['the']
23135851162
>>> mini_fd['a']
9081174698
### Now ready to build the full FreqDist.
>>> goog1w_fd = nltk.FreqDist()
>>> for (word, count) in goog1w_rank:
...     goog1w_fd[word] = count
... 
>>> goog1w_fd['the']
23135851162
>>> goog1w_fd['platypus']
565585
>>> goog1w_fd['penguin']
5835109
>>> goog1w_fd['pittsburgh']
19654781
>>> goog1w_fd['cleveland']
19041185
>>> goog1w_fd['philadelphia']
30179898
>>> goog1w_fd['philly']
2102437
### goog1w_fd is good for looking up a word's count;
### goog1w_rank is good for looking up a word by its frequency rank.
>>> goog1w_rank[0]
('the', 23135851162)
>>> goog1w_rank[9]
('that', 3400031103)
>>> goog1w_rank[99]
('find', 502043038)
>>> goog1w_rank[999]
('entry', 80717798)
>>> goog1w_rank[9999]
('poison', 5056083)
### Pickle the two data objects so they can be re-used later.
>>> import pickle
>>> f = open('goog1w_rank.pkl', 'wb')
>>> pickle.dump(goog1w_rank, f, -1)
>>> f.close()
>>> f = open('goog1w_fd.pkl', 'wb')
>>> pickle.dump(goog1w_fd, f, -1)
>>> f.close()
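### The whole file-to-FreqDist pipeline above could be condensed into a
### single helper for re-use. A sketch only (not run in this session):
### load_goog1w is a hypothetical name, and it assumes count_1w.txt's
### tab-separated "word<TAB>count" line format.
>>> def load_goog1w(path):
...     # returns (ranked list of (word, count) pairs, FreqDist keyed by word)
...     rank = []
...     fd = nltk.FreqDist()
...     with open(path) as fin:
...         for line in fin:
...             (word, count) = line.split()
...             rank.append((word, int(count)))
...             fd[word] = int(count)
...     return (rank, fd)
... 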
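### In a later session, the pickled objects can be loaded back without
### re-parsing the text file. A sketch (not run here) mirroring the
### pickle.dump calls above, with 'rb' mode and pickle.load:
>>> f = open('goog1w_fd.pkl', 'rb')
>>> goog1w_fd = pickle.load(f)
>>> f.close()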