# For HW3: https://sites.pitt.edu/~naraehan/ling1330/hw3.html
# Your name, email
# Date

def print_by_n(li, n):
    """Print the items of a list compactly, n items per output line.

    Every item is written followed by a single space. A line break is
    emitted after each n-th item, and one more at the very end when the
    last line is only partially filled, so the output always ends with a
    newline for a non-empty list.

    li -- the list (any sized sequence) whose items are printed
    n  -- how many items to place on each line
    """
    for index, element in enumerate(li):
        print(element, end=" ")
        if index % n != n - 1:
            continue
        # This item filled the current line; start a new one.
        print()

    # A partially filled last line has not been terminated yet.
    leftover = len(li) % n
    if leftover != 0:
        print()

# ------------------------------------------------------------------------
#                                                              PREPARATION
print("...Importing and reading data files...")
# ------------------------------------------------------------------------

# [A1] Import the libraries and functions needed by the steps below.
# YOUR CODE BELOW.

# [A2] Unpickle your Google k-band dictionary (it is used in step [D5]).
# NOTE: only unpickle a file you created yourself; loading a pickle from an
# untrusted source can execute arbitrary code.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
#                                                          LOADING CORPORA
print("...Loading Bulgarian and Japanese corpora...")
# ------------------------------------------------------------------------

# [B1] Set your corpus root.
# 'foo' is a placeholder; replace it with the location of your corpus files.
# YOUR CODE BELOW.
corpus_root = 'foo'

# [B2] Read in the two corpora.
# Both names are bound to the placeholder string 'foo' until you edit them.
# NOTE(review): steps [B3]/[B4] need objects that can report files, sentences
# and words -- presumably corpus readers built from corpus_root; confirm
# against the HW spec.
# EDIT THE CODE BELOW.
bucor = 'foo'
jacor = 'foo'

# [B3] Print out some basic specs of the two corpora. First off, the number
# of files in each.
# YOUR CODE BELOW.

# [B4] Now, print the total number of sentences and the total number of
# words in each corpus.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
#                                                    BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Build lowercased token lists, one per corpus.
# The empty lists are placeholders.
# EDIT THE CODE BELOW.
bu_toks = []
ja_toks = []

# [C2] Build word (unigram) frequency distributions.
# The empty lists are placeholders only: the [E1] example calls
# .most_common(50) on a frequency distribution, which a plain list does not
# provide, so these must be replaced with real frequency distributions.
# EDIT THE CODE BELOW.
bu_tokfd = []
ja_tokfd = []

# [C3] Build word bigrams. You should cast them as *lists*.
# The empty lists are placeholders.
# EDIT THE CODE BELOW.
bu_bigrams = []
ja_bigrams = []

# [C4] Build bigram frequency distributions.
# "foo" is a placeholder string; replace it with the actual distribution.
# EDIT THE CODE BELOW.
bu_bigramfd = "foo"
ja_bigramfd = "foo"

# [C5] Build conditional frequency distributions of the two bigram lists.
# (You won't likely need these, but just in case you want to explore.)
# "foo" is a placeholder string; replace it with the actual distribution.
# EDIT THE CODE BELOW.
bu_bigramcfd = "foo"
ja_bigramcfd = "foo"


# ------------------------------------------------------------------------
#                                         MEASUREMENTS FOR WRITING QUALITY
print("...Computing measurements for writing quality...")
# ------------------------------------------------------------------------

# [D1] Calculate and print out the average essay length for each corpus.
# NOTE(review): presumably measured in words per essay -- confirm against
# the HW spec.
# YOUR CODE BELOW.

# [D2] Calculate and print out the average sentence length for each corpus.
# NOTE(review): presumably measured in words per sentence -- confirm against
# the HW spec.
# YOUR CODE BELOW.

# [D3] Calculate and print out the TTR (type-token ratio) for each corpus.
# YOUR CODE BELOW.

# [D4] Calculate and print out the average word length for each corpus.
# NOTE(review): presumably measured in characters per word -- confirm against
# the HW spec.
# YOUR CODE BELOW.

# [D5] Using the k-band dictionary, calculate and print out the average
# vocabulary band of each corpus.
# YOUR CODE BELOW.

# [D6] Build a list of Bulgarian and Japanese word types that are in band 11
# or higher. Calculate their percentage against the type count. Print out
# some examples.
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
#                                                       N-GRAM FREQUENCIES
print("...Examining unigram and bigram frequencies...")
# ------------------------------------------------------------------------

# [E1] Print the top 50 unigram frequencies of the two corpora.
# Do NOT print one unigram per line; that would result in LONG output.
# Instead, use the print_by_n custom function defined at the top of this
# file to print out, say, 5 items per line, like:
#     print_by_n(xxfd.most_common(50), 5)
# YOUR CODE BELOW.

# [E2] Print the top 50 bigram frequencies of the two corpora.
# Again, print n items per line using the print_by_n function.
# YOUR CODE BELOW.