# For HW3: https://sites.pitt.edu/~naraehan/ling1330/hw3.html
# Your name, email
# Date


def print_by_n(li, n):
    """Print the items of ``li`` to stdout, ``n`` items per line.

    A custom function for space-efficient list printing. Every item is
    followed by a single space, and each line (including a final, partially
    filled one) is terminated by a newline. Nothing is printed for an empty
    input.

    Args:
        li: The items to print. Any iterable is accepted, not only lists.
        n: Number of items per output line; must be at least 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive row width would otherwise fail with an opaque
        # ZeroDivisionError (n == 0) or silently never break the line (n < 0).
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    count = 0  # Stays 0 when `li` is empty, so no stray newline is printed.
    for count, item in enumerate(li, start=1):
        print(item, end=" ")
        if count % n == 0:
            print()
    if count % n != 0:
        print()  # Without this, no newline after a partially filled last row.


# ------------------------------------------------------------------------
# PREPARATION
print("...Importing and reading data files...")
# ------------------------------------------------------------------------

# [A1] Import libraries and functions here.
# YOUR CODE BELOW.


# [A2] Unpickle your Google k-band dictionary.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
# LOADING CORPORA
print("...Loading Bulgarian and Japanese corpora...")
# ------------------------------------------------------------------------

# [B1] Set your corpus root.
# YOUR CODE BELOW.
corpus_root = 'foo'

# [B2] Read in the two corpora.
# EDIT THE CODE BELOW.
bucor = 'foo'
jacor = 'foo'

# [B3] Print out some basic specs of the two corpora. First off, # of files.
# YOUR CODE BELOW.


# [B4] Now, print total # of sentences and # of words.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
# BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Build lowercased token lists.
# EDIT THE CODE BELOW.
bu_toks = []
ja_toks = []

# [C2] Build word (unigram) frequency distributions.
# EDIT THE CODE BELOW.
bu_tokfd = []
ja_tokfd = []

# [C3] Build word bigrams. You should cast them as *lists*.
# EDIT THE CODE BELOW.
bu_bigrams = []
ja_bigrams = []

# [C4] Build bigram frequency distributions.
# EDIT THE CODE BELOW.
bu_bigramfd = "foo"
ja_bigramfd = "foo"

# [C5] Build conditional frequency distributions of the two bigram lists.
# (You won't likely need these, but just in case you want to explore.)
# EDIT THE CODE BELOW.
bu_bigramcfd = "foo"
ja_bigramcfd = "foo"


# ------------------------------------------------------------------------
# MEASUREMENTS FOR WRITING QUALITY
print("...Computing measurements for writing quality...")
# ------------------------------------------------------------------------

# [D1] Calculate and print out the average essay length for each.
# YOUR CODE BELOW.


# [D2] Calculate and print out the average sentence length for each.
# YOUR CODE BELOW.


# [D3] Calculate and print out the TTR (type-token ratio) for each.
# YOUR CODE BELOW.


# [D4] Calculate and print out the average word length for each.
# YOUR CODE BELOW.


# [D5] Using the k-band dictionary, calculate and print out the average
# vocabulary band of each corpus.
# YOUR CODE BELOW.


# [D6] Build a list of Bulgarian and Japanese word types that are in band 11
# or higher. Calculate their % against the type count. Print out some examples.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
# N-GRAM FREQUENCIES
print("...Examining unigram and bigram frequencies...")
# ------------------------------------------------------------------------

# [E1] Print the top 50 unigram frequencies of the two corpora.
# Do NOT print each unigram in a line... that would result in LONG output.
# Instead, use the print_by_n custom function above to print out, say,
# 5 items per line, like:
#     print_by_n(xxfd.most_common(50), 5)
# YOUR CODE BELOW.


# [E2] Print the top 50 bigram frequencies of the two corpora.
# Again, print n items per line using the print_by_n function.
# YOUR CODE BELOW.