Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. import nltk >>> br_tw = nltk.corpus.brown.tagged_words(categories='mystery') >>> len(br_tw) 57169 >>> br_tw[:5] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] # ---------------------------- Lowercasing tokens >>> br_tw_lower = [(w.lower(),t) for (w,t) in br_tw] >>> br_tw_lower[:5] [('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] # ----------------------------- POS frequencies of word types >>> br_cfd = nltk.ConditionalFreqDist(br_tw_lower) >>> br_cfd['so'] FreqDist({'RB': 48, 'QL': 44, 'CS': 34}) >>> br_cfd['question'] FreqDist({'NN': 5, 'VB': 3}) >>> br_cfd['question'].freq('NN') 0.625 >>> br_cfd['question'].freq('VB') 0.375 >>> br_cfd['like'] FreqDist({'CS': 103, 'VB': 19, 'IN': 14, 'JJ': 3}) >>> nltk.help.brown_tagset('CS') CS: conjunction, subordinating that as after whether before while like because if since for than altho until so unless though providing once lest s'posin' till whereas whereupon supposing tho' albeit then so's 'fore >>> nltk.help.brown_tagset('IN') IN: preposition of in for by considering to on among at through with under into regarding than since despite according per before toward against as after during including between without except upon out over ... >>> nltk.help.brown_tagset('VB') VB: verb, base: uninflected present, imperative or infinitive investigate find act follow inure achieve reduce take remedy re-set distribute realize disable feel receive continue place protect eliminate elaborate work permit run enter force ... 
>>> nltk.help.brown_tagset('V.*') VB: verb, base: uninflected present, imperative or infinitive investigate find act follow inure achieve reduce take remedy re-set distribute realize disable feel receive continue place protect eliminate elaborate work permit run enter force ... VB+AT: verb, base: uninflected present or infinitive + article wanna VB+IN: verb, base: uninflected present, imperative or infinitive + preposition lookit VB+JJ: verb, base: uninflected present, imperative or infinitive + adjective die-dead VB+PPO: verb, uninflected present tense + pronoun, personal, accusative let's lemme gimme VB+RP: verb, imperative + adverbial particle g'ahn c'mon VB+TO: verb, base: uninflected present, imperative or infinitive + infinitival to wanta wanna VB+VB: verb, base: uninflected present, imperative or infinitive; hypenated pair say-speak VBD: verb, past tense said produced took recommended commented urged found added praised charged listed became announced brought attended wanted voted defeated received got stood shot scheduled feared promised made ... VBG: verb, present participle or gerund modernizing improving purchasing Purchasing lacking enabling pricing keeping getting picking entering voting warning making strengthening setting neighboring attending participating moving ... VBG+TO: verb, present participle + infinitival to gonna VBN: verb, past participle conducted charged won received studied revised operated accepted combined experienced recommended effected granted seen protected adopted retarded notarized selected composed gotten printed ... VBN+TO: verb, past participle + infinitival to gotta VBZ: verb, present tense, 3rd person singular deserves believes receives takes goes expires says opposes starts permits expects thinks faces votes teaches holds calls fears spends collects backs eliminates sets flies gives seeks reads ... 
# ----------------------------- building trigrams >>> br_tw_3grams = list(nltk.ngrams(br_tw_lower, 3)) >>> br_tw_3grams[0] (('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD')) >>> br_tw_3grams[1] (('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS')) >>> br_tw_3grams[2] (('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')) >>> br_tw_3grams[-1] (('me', 'PPO'), ("''", "''"), ('.', '.')) # --------------------------- conditional frequency of preceding POS >>> br_pre = [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_tw_3grams] >>> br_pre[0] ('were/BED', 'EX') >>> br_pre[:5] [('were/BED', 'EX'), ('thirty-eight/CD', 'BED'), ('patients/NNS', 'CD'), ('on/IN', 'NNS'), ('the/AT', 'IN')] >>> br_pre_cfd = nltk.ConditionalFreqDist(br_pre) >>> br_pre_cfd['so/CS'] FreqDist({'.': 7, ',': 6, 'NN': 5, 'NNS': 3, 'QLP': 2, 'PPO': 2, 'VBD': 2, '--': 2, 'RB': 1, 'CC': 1, ...}) >>> br_pre_cfd['so/QL'] FreqDist({'IN': 5, 'BEDZ': 5, ',': 4, 'RB': 3, 'NN': 3, 'QL': 3, 'VBN': 2, 'VB': 2, 'PPO': 2, 'BEN': 1, ...}) >>> br_pre_cfd['so/QL'].most_common(5) [('IN', 5), ('BEDZ', 5), (',', 4), ('RB', 3), ('NN', 3)] >>> nltk.help.brown_tagset('BEDZ') BEDZ: verb 'to be', past tense, 1st and 3rd person singular was >>> br_pre_cfd['question/VB'] FreqDist({'TO': 2, 'DO*': 1}) >>> nltk.help.brown_tagset('DO*') DO*: verb 'to do', uninflected present tense or imperative, negated don't >>> br_pre_cfd['question/NN'] FreqDist({'JJ': 2, 'CD': 1, 'AT': 1, 'OD': 1}) >>> br_pre_cfd['like/VB'] FreqDist({'PPSS': 8, 'PPSS+MD': 5, 'NNS': 2, 'DOD*': 1, 'DO*': 1, 'TO': 1, 'MD*': 1}) >>> nltk.help.brown_tagset('PPSS') PPSS: pronoun, personal, nominative, not 3rd person singular they we I you ye thou you'uns >>> br_pre_cfd['like/CS'] FreqDist({'NN': 18, ',': 14, 'RB': 11, 'VB': 8, 'VBD': 7, 'BEDZ': 5, 'NNS': 5, 'VBN': 4, 'RBR': 4, 'VBG': 4, ...}) >>> br_pre_cfd['like/IN'] FreqDist({'BEDZ*': 2, 'PN': 2, 'BED': 2, 'NN': 2, 'VB': 1, 'NNS': 1, 'VBG': 1, 'RB': 1, 'NP': 1, 'RBR': 1}) >>> 
nltk.help.brown_tagset('BEDZ*') BEDZ*: verb 'to be', past tense, 1st and 3rd person singular, negated wasn't # ---------------------------- Now onto Penn Treebank >>> from nltk.corpus import treebank >>> treebank.words() ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...] >>> treebank.tagged_words() [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...] >>> treebank.tagged_words()[:10] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank.tagged_sents() [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...] 
>>> treebank.tagged_sents()[0] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] >>> treebank.tagged_sents()[1] [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')] >>> treebank.tagged_sents()[-1] [('Trinity', 'NNP'), ('said', 'VBD'), ('0', '-NONE-'), ('it', 'PRP'), ('plans', 'VBZ'), ('*-1', '-NONE-'), ('to', 'TO'), ('begin', 'VB'), ('delivery', 'NN'), ('in', 'IN'), ('the', 'DT'), ('first', 'JJ'), ('quarter', 'NN'), ('of', 'IN'), ('next', 'JJ'), ('year', 'NN'), ('.', '.')] >>> treebank.parsed_sents()[0] Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]) # --------------------------- Syntactic trees are RECURSIVE! >>> treebank.parsed_sents()[0].pprint() (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. 
.)) >>> treebank.parsed_sents()[0].draw() # --------------------------- Penn Treebank POS distribution >>> treebank.tagged_words()[:10] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank_tw_lower = [(w.lower(),t) for (w,t) in treebank.tagged_words()] >>> treebank.tagged_words()[:10] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank_tw_lower[:10] [('pierre', 'NNP'), ('vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank_cfd = nltk.ConditionalFreqDist(treebank_tw_lower) >>> treebank_cfd['question'] FreqDist({'NN': 12, 'VBP': 1, 'VB': 1}) >>> treebank_cfd['so'] FreqDist({'RB': 66, 'IN': 27}) # only RB and IN? >>> nltk.help.upenn_tagset('RB') RB: adverb occasionally unabatingly maddeningly adventurously professedly stirringly prominently technologically magisterially predominately swiftly fiscally pitilessly ... >>> nltk.help.upenn_tagset('IN') IN: preposition or conjunction, subordinating astride among uppon whether out inside pro despite on by throughout below within for towards near behind atop around if like until below next into if beside ... # --------------------------- Treebank folds the CS (subordinating conjunction) use of 'so' into its IN tag (preposition or subordinating conjunction), and the QL (qualifier) use into its broader RB (adverb) tag >>> treebank_cfd['share'] FreqDist({'NN': 117, 'VB': 3}) # --------------------------- 'share' is overwhelmingly a noun?? Digging deeper... 
>>> dir(treebank) ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_citation', '_comment_char', '_detect_blocks', '_encoding', '_fileids', '_get_root', '_license', '_normalize', '_parse', '_read_block', '_read_parsed_sent_block', '_read_sent_block', '_read_tagged_sent_block', '_read_tagged_word_block', '_read_word_block', '_readme', '_root', '_tag', '_tagset', '_unload', '_word', 'abspath', 'abspaths', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'parsed_sents', 'raw', 'readme', 'root', 'sents', 'tagged_sents', 'tagged_words', 'words'] >>> treebank.readme()[:500] '[ PENN TREEBANK SAMPLE ]\r\nhttp://www.cis.upenn.edu/~treebank/home.html\r\n\r\nThis is a ~5% fragment of Penn Treebank, (C) LDC 1995. It is made\r\navailable under fair use for the purposes of illustrating NLTK tools\r\nfor tokenizing, tagging, chunking and parsing. This data is for\r\nnon-commercial use only.\r\n\r\nContents: raw, tagged, parsed and combined data from Wall Street\r\nJournal for 1650 sentences (99 treebank files wsj_0001 .. wsj_0099).\r\nFor details about each of the four types, please see the o' >>> print(treebank.readme()[:500]) [ PENN TREEBANK SAMPLE ] http://www.cis.upenn.edu/~treebank/home.html This is a ~5% fragment of Penn Treebank, (C) LDC 1995. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only. Contents: raw, tagged, parsed and combined data from Wall Street Journal for 1650 sentences (99 treebank files wsj_0001 .. wsj_0099). 
For details about each of the four types, please see the o # --------------------------- More tag ambiguity >>> treebank_cfd['will'] FreqDist({'MD': 280, 'NN': 1}) >>> treebank_cfd['fly'] FreqDist({'VB': 1, 'VBP': 1}) >>> treebank_cfd['like'] FreqDist({'IN': 49, 'VB': 8, 'VBP': 4, 'JJ': 1}) # --------------------------- Penn Treebank tagset definitions >>> nltk.help.upenn_tagset('VB') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... >>> nltk.help.upenn_tagset('V.*') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... VBD: verb, past tense dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ... VBG: verb, present participle or gerund telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ... VBN: verb, past participle multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... VBZ: verb, present tense, 3rd person singular bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ...