# YOUR NAME HERE: ________________________
# DATE and EMAIL: ________________________
# HW 4 "Who Said It?"
# https://sites.pitt.edu/~naraehan/ling1330/hw4.html

### Some STEPs are already COMPLETE.
### Commands you need to EDIT are marked as such.
###   <-- They are shown as empty lists/None object/0.0, etc.
###   <-- so that the script can be run without breaking.

import random

import nltk

#------------------------------------------------ STEP 1 (COMPLETE)
print("1. Loading Austen and Melville sentences...")
a_sents_all = nltk.corpus.gutenberg.sents('austen-emma.txt')
m_sents_all = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')

#------------------------------------------------ STEP 2
print("2. Discarding short sentences and labeling...")
# Each kept sentence becomes a (token_list, author_label) pair;
# sentences of 2 tokens or fewer are dropped.
a_sents = [(s, 'austen') for s in a_sents_all if len(s) > 2]
m_sents = []        # EDIT

#------------------------------------------------ STEP 3
print("3. Joining the two author sentence lists...")
sents = []          # EDIT

#------------------------------------------------ STEP 4 (COMPLETE)
print("4. Sentence stats:")
print(" # of total sentences:", len(sents))
print(" # of Austen sentences:", len(a_sents))
print(" # of Melville sentences:", len(m_sents))

#------------------------------------------------ STEP 5
print("5. Shuffling...")
# EDIT -- shuffle sents here

#------------------------------------------------ STEP 6
print("6. Partitioning...")
test_sents = []     # EDIT
devtest_sents = []  # EDIT
train_sents = []    # EDIT
print(" # of test sentences:", len(test_sents))
print(" # of devtest sentences:", len(devtest_sents))
print(" # of training sentences:", len(train_sents))

#------------------------------------------------ STEP 7 (COMPLETE)
print("7. Defining a feature-generator function...")

# Proper nouns that gen_feats() replaces with a placeholder when
# noCharNames is switched on.
# NOTE(review): the message printed below says "Top 35", but this set
# literal holds 29 names -- confirm which number is intended.
mainchars = {'Emma', 'Harriet', 'Ahab', 'Weston', 'Knightley', 'Elton',
             'Woodhouse', 'Jane', 'Stubb', 'Queequeg', 'Fairfax',
             'Churchill', 'Frank', 'Starbuck', 'Pequod', 'Hartfield',
             'Bates', 'Highbury', 'Perry', 'Bildad', 'Peleg', 'Pip', 'Cole',
             'Goddard', 'Campbell', 'Donwell', 'Dixon', 'Taylor', 'Tashtego'}

noCharNames = False     # For [PART B] Q3
if noCharNames:
    print('NOTE: Top 35 proper nouns have been neutralized.')


def gen_feats(sent):
    """Build a bag-of-words feature dictionary for one sentence.

    Args:
        sent: an iterable of word tokens (strings).

    Returns:
        A dict with one key 'contains-<token lowercased>' per distinct
        lowercased token in *sent*, each mapped to 1.

    When the module-level flag ``noCharNames`` is truthy, any token found
    in ``mainchars`` is replaced by 'MontyPython' before the key is built,
    so all of those names collapse into the single key
    'contains-montypython'.  The membership test happens before
    lowercasing, so it is case-sensitive ('Emma' matches, 'emma' and
    'EMMA' do not).
    """
    featdict = {}
    for w in sent:
        if noCharNames and w in mainchars:
            w = 'MontyPython'
        featdict['contains-' + w.lower()] = 1
    return featdict


#------------------------------------------------ STEP 8
print("8. Generating feature sets...")
test_feats = []     # EDIT
devtest_feats = []  # EDIT
train_feats = []    # EDIT

#------------------------------------------------ STEP 9
print("9. Training...")
whosaid = None      # EDIT

#------------------------------------------------ STEP 10
print("10. Testing...")
accuracy = 0.0      # EDIT
print(" Accuracy score:", accuracy)

#------------------------------------------------ STEP 11
print("11. Sub-dividing development testing set...")
# aa: real author Austen, guessed Austen
# mm: real author Melville, guessed Melville
# am: real author Austen, guessed Melville
# ma: real author Melville, guessed Austen
aa, mm, am, ma = [], [], [], []
# NOTE: whosaid is still None until STEP 9 is filled in.  This loop is only
# safe in the unedited template because devtest_sents is empty; once STEP 6
# populates devtest_sents, STEP 9 must provide an object with .classify().
for (sent, auth) in devtest_sents:
    guess = whosaid.classify(gen_feats(sent))
    if auth == 'austen' and guess == 'austen':
        aa.append((auth, guess, sent))
    # EDIT below to populate mm, am, ma

#------------------------------------------------ STEP 12
print("12. Sample CORRECT and INCORRECT predictions from dev-test set:")
print("-------")
for x in (aa):      # EDIT change (aa) to (aa, mm, am, ma)
    # random.choice() raises IndexError on an empty sequence, so skip a
    # bucket that received no sentences instead of crashing.
    if not x:
        continue
    auth, guess, sent = random.choice(x)
    print('REAL=%-8s GUESS=%-8s' % (auth, guess))   # string formatting
    print(' '.join(sent))
    print("-------")
print()

#------------------------------------------------ STEP 13
print("13. Looking up most informative features...")
# EDIT -- try .show_most_informative_features(40) first,
#         and then .show_most_informative_feats_all(40)