Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. >>> import nltk #-------------------------------------------- Exploring names "corpus" (dataset, really) >>> from nltk.corpus import names >>> names.fileids() ['female.txt', 'male.txt'] >>> dir(names) ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_citation', '_encoding', '_fileids', '_get_root', '_license', '_readme', '_root', '_tagset', '_unload', 'abspath', 'abspaths', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'raw', 'readme', 'root', 'words'] >>> names.raw('female.txt')[:100] 'Abagael\nAbagail\nAbbe\nAbbey\nAbbi\nAbbie\nAbby\nAbigael\nAbigail\nAbigale\nAbra\nAcacia\nAda\nAdah\nAdaline\nAdar' >>> print(names.raw('female.txt')[:100]) Abagael Abagail Abbe Abbey Abbi Abbie Abby Abigael Abigail Abigale Abra Acacia Ada Adah Adaline Adar >>> print(names.raw('male.txt')[:100]) Aamir Aaron Abbey Abbie Abbot Abbott Abby Abdel Abdul Abdulkarim Abdullah Abe Abel Abelard Abner Abr >>> names.words('female.txt')[:10] ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale'] >>> names.words('female.txt')[-10:] ['Zonnya', 'Zora', 'Zorah', 'Zorana', 'Zorina', 'Zorine', 'Zsa Zsa', 'Zsazsa', 'Zulema', 'Zuzana'] #------------------------------------------------- Create two gendered name lists >>> fnames = names.words('female.txt') >>> mnames = names.words('male.txt') >>> len(fnames) 5001 >>> len(mnames) 2943 >>> 'Zack' in mnames True >>> 'Zack' in fnames False >>> 'Taylor' in mnames True >>> 'Taylor' in fnames # Wait, what? Gotta dig deeper False >>> print(names.readme()) Names Corpus, Version 1.3 (1994-03-29) Copyright (C) 1991 Mark Kantrowitz Additions by Bill Ross This corpus contains 5001 female names and 2943 male names, sorted alphabetically, one per line. You may use the lists of names for any purpose, so long as credit is given in any published work. You may also redistribute the list if you provide the recipients with a copy of this README file. The lists are not in the public domain (I retain the copyright on the lists) but are freely redistributable. If you have any additions to the lists of names, I would appreciate receiving them. Mark Kantrowitz http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/ #----------------------------------------- Makes sense, data is from 1991... >>> in_both = [n for n in mnames if n in fnames] >>> len(in_both) 365 >>> in_both[:30] ['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis', 'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel', 'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', 'Barrie', 'Barry', 'Beau', 'Bennie', 'Benny'] >>> in_both[-30:] ['Timmy', 'Tobe', 'Tobie', 'Toby', 'Tommie', 'Tommy', 'Tony', 'Torey', 'Trace', 'Tracey', 'Tracie', 'Tracy', 'Val', 'Vale', 'Valentine', 'Van', 'Vin', 'Vinnie', 'Vinny', 'Virgie', 'Wallie', 'Wallis', 'Wally', 'Whitney', 'Willi', 'Willie', 'Willy', 'Winnie', 'Winny', 'Wynn'] #---------------------- What features will be useful? first char and last char >>> def gender_features(word): ... return {'firstchar': word[0], 'lastchar': word[-1]} ... >>> gender_features('William') {'firstchar': 'W', 'lastchar': 'm'} >>> gender_features('Na-Rae') {'firstchar': 'N', 'lastchar': 'e'} #----------------------------------- Merging into a single list >>> allnames = fnames + mnames >>> allnames[:10] ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale'] >>> allnames[-10:] ['Zed', 'Zedekiah', 'Zeke', 'Zelig', 'Zerk', 'Zeus', 'Zippy', 'Zollie', 'Zolly', 'Zorro'] #--------------------------------- Problem: we lost gender information with each name #--------------------------------- Label each name with gender, THEN merge >>> fnames_labeled = [(n, 'female') for n in fnames] >>> mnames_labeled = [(n, 'male') for n in mnames] >>> allnames_labeled = fnames_labeled + mnames_labeled >>> allnames_labeled[:5] [('Abagael', 'female'), ('Abagail', 'female'), ('Abbe', 'female'), ('Abbey', 'female'), ('Abbi', 'female')] >>> allnames_labeled[-5:] [('Zeus', 'male'), ('Zippy', 'male'), ('Zollie', 'male'), ('Zolly', 'male'), ('Zorro', 'male')] #------------------------- Converting names into their feature representation >>> allnames_feats = [(gender_features(n),g) for (n,g) in allnames_labeled] >>> allnames_feats[:5] [({'firstchar': 'A', 'lastchar': 'l'}, 'female'), ({'firstchar': 'A', 'lastchar': 'l'}, 'female'), ({'firstchar': 'A', 'lastchar': 'e'}, 'female'), ({'firstchar': 'A', 'lastchar': 'y'}, 'female'), ({'firstchar': 'A', 'lastchar': 'i'}, 'female')] >>> allnames_feats[-5:] [({'firstchar': 'Z', 'lastchar': 's'}, 'male'), ({'firstchar': 'Z', 'lastchar': 'y'}, 'male'), ({'firstchar': 'Z', 'lastchar': 'e'}, 'male'), ({'firstchar': 'Z', 'lastchar': 'y'}, 'male'), ({'firstchar': 'Z', 'lastchar': 'o'}, 'male')] # -------------------------- Girl names front and boy names back --> must randomize! >>> import random >>> random.shuffle(allnames_feats) # shuffles list IN PLACE >>> allnames_feats[:5] [({'firstchar': 'F', 'lastchar': 'd'}, 'female'), ({'firstchar': 'G', 'lastchar': 't'}, 'female'), ({'firstchar': 'B', 'lastchar': 'l'}, 'female'), ({'firstchar': 'C', 'lastchar': 'y'}, 'male'), ({'firstchar': 'F', 'lastchar': 'c'}, 'male')] >>> allnames_feats[-5:] [({'firstchar': 'C', 'lastchar': 'b'}, 'female'), ({'firstchar': 'S', 'lastchar': 'd'}, 'male'), ({'firstchar': 'S', 'lastchar': 'n'}, 'female'), ({'firstchar': 'S', 'lastchar': 'd'}, 'male'), ({'firstchar': 'D', 'lastchar': 's'}, 'male')] #---------------------------- Partition feature list into test and train set >>> test_set = allnames_feats[:500] >>> train_set = allnames_feats[500:] >>> len(test_set), len(train_set) (500, 7444) #-------------------------------------------------- Now train a NB classifier >>> boyorgirl = nltk.NaiveBayesClassifier.train(train_set) >>> dir(boyorgirl) ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_feature_probdist', '_label_probdist', '_labels', 'classify', 'classify_many', 'labels', 'most_informative_features', 'prob_classify', 'prob_classify_many', 'show_most_informative_features', 'train'] >>> boyorgirl.labels() ['male', 'female'] #------------------------------------------- Trying classifier on new names >>> boyorgirl.classify('Neo') Traceback (most recent call last): File "", line 1, in boyorgirl.classify('Neo') File "C:\Program Files\Python311\Lib\site-packages\nltk\classify\naivebayes.py", line 89, in classify return self.prob_classify(featureset).max() File "C:\Program Files\Python311\Lib\site-packages\nltk\classify\naivebayes.py", line 95, in prob_classify featureset = featureset.copy() AttributeError: 'str' object has no attribute 'copy' # Oops, can't directly classfy name string >>> gender_features('Neo') {'firstchar': 'N', 'lastchar': 'o'} >>> boyorgirl.classify(gender_features('Neo')) # classify on features 'male' >>> boyorgirl.classify(gender_features('Na-Rae')) 'female' >>> gender_features('Na-Rae') {'firstchar': 'N', 'lastchar': 'e'} #------------------------- Evaluating classifier's performance on test set >>> nltk.classify.accuracy(boyorgirl, test_set) 0.816 >>> test_set[0] ({'firstchar': 'F', 'lastchar': 'd'}, 'female') >>> test_set[1] ({'firstchar': 'G', 'lastchar': 't'}, 'female') #------------------------------------- What are most informative features? >>> boyorgirl.show_most_informative_features(30) Most Informative Features lastchar = 'a' female : male = 37.2 : 1.0 lastchar = 'k' male : female = 32.9 : 1.0 lastchar = 'f' male : female = 15.8 : 1.0 lastchar = 'p' male : female = 11.8 : 1.0 lastchar = 'm' male : female = 11.1 : 1.0 lastchar = 'd' male : female = 10.1 : 1.0 lastchar = 'v' male : female = 9.1 : 1.0 lastchar = 'o' male : female = 7.7 : 1.0 lastchar = 'r' male : female = 6.6 : 1.0 lastchar = 'w' male : female = 5.8 : 1.0 lastchar = 'g' male : female = 5.2 : 1.0 firstchar = 'W' male : female = 5.1 : 1.0 lastchar = 'z' male : female = 4.6 : 1.0 lastchar = 's' male : female = 4.2 : 1.0 lastchar = 't' male : female = 4.0 : 1.0 lastchar = 'j' male : female = 3.9 : 1.0 lastchar = 'i' female : male = 3.6 : 1.0 lastchar = 'b' male : female = 3.5 : 1.0 lastchar = 'u' male : female = 3.0 : 1.0 firstchar = 'Q' male : female = 2.6 : 1.0 firstchar = 'U' male : female = 2.5 : 1.0 firstchar = 'K' female : male = 2.3 : 1.0 firstchar = 'H' male : female = 2.2 : 1.0 lastchar = 'n' male : female = 2.1 : 1.0 firstchar = 'X' male : female = 2.0 : 1.0 firstchar = 'Z' male : female = 1.9 : 1.0 lastchar = 'x' male : female = 1.9 : 1.0 lastchar = 'e' female : male = 1.7 : 1.0 lastchar = 'l' male : female = 1.7 : 1.0 firstchar = 'L' female : male = 1.7 : 1.0