# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
# ------------------
# Edited October 2023
# Two additional functions by Na-Rae Han, naraehan@pitt.edu:
#     feature_weights(self, fname, fval)
#     show_most_informative_feats_all(self, n=10)

"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                               P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                      P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""

from collections import defaultdict

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs

##//////////////////////////////////////////////////////
##  Naive Bayes Classifier
##//////////////////////////////////////////////////////


class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = list(label_probdist.samples())

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()
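    # Usage sketch (illustrative only; the feature name, labels, and numbers
    # below are hypothetical, not produced by this module):
    #
    #     >>> classifier.classify({'last_letter': 'a'})                      # doctest: +SKIP
    #     'female'
    #     >>> classifier.prob_classify({'last_letter': 'a'}).prob('female')  # doctest: +SKIP
    #     0.96...
    #
    # classify() simply returns the highest-probability label from the
    # DictionaryProbDist computed by prob_classify() below.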
    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
        return self._most_informative_features[:n]
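    # Illustrative shape of the input expected by train() below; the feature
    # names and labels are hypothetical examples, not fixed by the API:
    #
    #     labeled_featuresets = [
    #         ({'last_letter': 'a', 'length': 5}, 'female'),
    #         ({'last_letter': 'k', 'length': 4}, 'male'),
    #     ]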
    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution.
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)

    def feature_weights(self, fname, fval):
        """
        Returns, for each label, the probability of the given feature
        value, as a dict mapping label -> P(fname=fval|label).
        Added by Na-Rae Han, October 2023.
        """
        cpdist = self._feature_probdist
        wdict = {}
        for l in self._labels:
            wdict[l] = cpdist[l, fname].prob(fval)
        return wdict
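    # Illustrative call to feature_weights() above; the feature, labels, and
    # probabilities are hypothetical, echoing the Melville/Austen example in
    # the docstring below:
    #
    #     >>> classifier.feature_weights('contains-whale', 1)  # doctest: +SKIP
    #     {'Melville': 0.0085, 'Austen': 0.0002}
    #
    # Values for (label, fname) pairs never observed in training come from
    # the smoothed estimator, so they are tiny but nonzero.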
    def show_most_informative_feats_all(self, n=10):
        """
        Displays the top n most informative features.  Unlike
        show_most_informative_features, this version includes the odds
        ratios of (feature, value) pairs that do not co-occur with all
        labels.  For example, if 'contains-whale':1 was found to occur
        with the label 'Melville' but never with 'Austen',
        show_most_informative_features does not list it in the result.
        This variant does, which means that the top ranks are likely to
        be dominated by such features: their ratios have as the
        denominator the weight of the unobserved feature value, which is
        some minuscule positive value assigned for the purpose of
        smoothing rather than 0.  Such features are marked with "~"
        instead of "=".
        Added By Na-Rae Han, October 2023.
        """
        cpdist = self._feature_probdist
        wdict = {}   # dictionary of feature weights
        wrdict = {}  # dictionary of odds ratios
        label_pairs = []
        for i in range(len(self._labels) - 1):
            label_pairs.append((self._labels[i], self._labels[i + 1]))

        fname_val = set()  # a set of (featurename, value) tuples
        for (label, fname), probdist in cpdist.items():
            for fval in probdist.samples():  # only fvals observed with label
                wdict[((fname, fval), label)] = cpdist[label, fname].prob(fval)
                fname_val.add((fname, fval))

        # Create an odds-ratio dictionary.
        smoothed = set()  # (fname,fval) not observed with l1 or l2
        for (fname, fval) in fname_val:
            for (l1, l2) in label_pairs:
                # (fname,fval) pairs not observed with l1 or l2 will not be
                # in wdict; retrieve the ~0 smoothed weight from .prob(fval).
                if ((fname, fval), l1) not in wdict:
                    wdict[((fname, fval), l1)] = cpdist[l1, fname].prob(fval)
                    smoothed.add((fname, fval))
                if ((fname, fval), l2) not in wdict:
                    wdict[((fname, fval), l2)] = cpdist[l2, fname].prob(fval)
                    smoothed.add((fname, fval))
                w_l1 = wdict[((fname, fval), l1)]
                w_l2 = wdict[((fname, fval), l2)]
                wrdict[((fname, fval), (l1, l2))] = w_l1 / w_l2
                wrdict[((fname, fval), (l2, l1))] = w_l2 / w_l1

        top = sorted(wrdict, key=wrdict.get, reverse=True)[:n]

        print("Most Informative Features, alternate version.")
        print("List does not omit feature+value pairs un-observed with some labels,")
        print("  which are marked with '~' instead of '='")
        print("--------------------")
        for (fnfv, (l1, l2)) in top:
            printout = "%30s %6s : %-6s = %8.1f : 1.0"
            if fnfv in smoothed:
                # Use "~" instead of "=" if the feature+value pair was not
                # observed with all labels.
                printout = "%30s %6s : %-6s ~ %8.1f : 1.0"
            print(printout % (fnfv, str(l1)[:6], str(l2)[:6], wrdict[(fnfv, (l1, l2))]))


##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()
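# ---------------------------------------------------------------------
# A minimal usage sketch (not part of the original module) for the two
# functions added by Na-Rae Han.  The gender_features extractor, the
# names-corpus split, and the 500-item cutoff below are assumptions chosen
# for illustration (following the NLTK book's names example) and require
# the 'names' corpus to be downloaded; none of them are defined elsewhere
# in this module.
def _added_functions_demo():
    import random

    from nltk.corpus import names

    def gender_features(word):
        # A single feature: the final character of the name.
        return {"last_letter": word[-1]}

    labeled_names = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]
    random.shuffle(labeled_names)
    featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
    classifier = NaiveBayesClassifier.train(featuresets[500:])

    # Per-label smoothed probability of one feature value, e.g.
    # {'female': ..., 'male': ...}.
    print(classifier.feature_weights("last_letter", "a"))

    # Ranking that also keeps feature values unseen with some labels
    # (rows marked with '~').
    classifier.show_most_informative_feats_all(n=10)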