Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Multinomial Naive Bayes (Basics)¶
Multinomial Naive Bayes (MNB) is a specialized variant of the Naive Bayes algorithm, primarily designed for classification tasks involving discrete data, such as text and document classification. It operates on the assumption that the features (e.g., words in a document) follow a multinomial distribution, making it particularly effective in contexts where data is represented as counts or frequencies. MNB calculates the probability of a given class based on the product of the conditional probabilities of its features, weighted by their frequencies in the data.
Difference from General Naive Bayes¶
The general Naive Bayes algorithm includes a family of classifiers that assume independence among features but differ in the probability distributions they use. For example, Gaussian Naive Bayes assumes features follow a normal distribution, making it suitable for continuous data. In contrast, Multinomial Naive Bayes assumes a multinomial distribution, explicitly designed for discrete data, such as word occurrences or categorical variables. This makes MNB particularly adept at tasks like spam detection, sentiment analysis, or topic classification, where data is often in the form of word counts or term frequencies. A critical distinction lies in how these algorithms handle data representation. While Gaussian Naive Bayes can handle continuous variables, MNB is optimized for integer or non-negative data, such as word counts. This optimization allows MNB to outperform other Naive Bayes variants in tasks where text data or categorical features are involved.
Importance of Learning Multinomial Naive Bayes¶
Understanding Multinomial Naive Bayes is crucial for anyone working in natural language processing (NLP) or text mining. It is a cornerstone algorithm for text classification problems, offering simplicity, efficiency, and effectiveness. For example, when dealing with large-scale datasets like emails, news articles, or customer reviews, MNB provides a computationally efficient method to build reliable models without extensive preprocessing or computational overhead. Moreover, learning MNB provides insights into probabilistic reasoning, feature independence assumptions, and how distributions can be tailored to specific data types. It often serves as a benchmark for evaluating more complex machine learning algorithms, helping practitioners assess the trade-offs between simplicity and performance. Understanding MNB also aids in comprehending fundamental NLP techniques like the bag-of-words and TF-IDF representations. By mastering MNB, learners gain a practical and theoretical foundation for tackling real-world problems in text analytics, paving the way for advanced exploration of machine learning models. Its simplicity, combined with its practical relevance, makes it a vital tool in the machine learning toolkit.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
from src.utils.libimports.multinomialnb import *
Motivating Example¶
Let's assume you want to build a classifier that automatically assigns a news article to a set of predefined categories (e.g., politics, sports, business, entertainment, lifestyle, etc.). This is a very common example of a text or document classification task. To keep it simple and allow for a better explanation of the MNB classifier, we consider only single sentences as our input documents and only two class labels (politics and sports). Lastly, let's assume we only have seven sentences in our whole training dataset, four of class politics and three of class sports. In more detail, the example dataset we will be using throughout the notebook looks as follows:
| Sentence | Class |
|---|---|
| The mayor was elected for this term and next term. | politics |
| A mayor's goal for the next term is to win. | politics |
| The goal for this term was to win the vote. | politics |
| This term's goals are next term's goals. | politics |
| The goal of any team player is the win. | sports |
| A win for the team is a win for each player. | sports |
| Players vote other players for another term. | sports |
Before we delve deeper into the inner workings of the MNB algorithm for classification, let's first try to get the basic underlying intuition the algorithm is based on. To do this, have another look at the example dataset above, but only consider the sentences and ignore the class labels. In other words, the class labels are not given, and assume that your task is now to decide whether a sentence came from a news article about politics or sports. Even without the class labels, most would probably say that the first four sentences are from political articles, while the last three sentences are from sports articles. But think about how you would explain your decisions. A simple and handwavy answer would be to say that the first four sentences just "talk about politics". But how can we make this statement more precise?
The core argument — and the core idea behind MNB — is that a sentence that "talks about politics" contains words that are more likely to be associated with political topics. For example, the word "mayor" is much more likely to appear in a political article compared to one about sports. Similarly, the word "player" is arguably more likely associated with topics about sports. However, this does not mean that words such as "mayor" may never appear in a sports article — for example, consider the sentence: "The team won the match and the mayor congratulated the players". In other words, we cannot apply hard rules that merely check for the presence or absence of words. Instead, we need soft rules that help us quantify the strength of the associations between words and topics. We can then use these learned associations to predict the class (e.g., the topic) of an article based on the words it contains.
In the context of the MNB classification algorithm, we express the associations between words and classes in terms of probabilities. In a nutshell, the MNB algorithm learns the probabilities of seeing a word in a document of a given class (for all different classes). Based on those probabilities, the algorithm can then predict the probability of a document belonging to a class based on the words within the document. The class with the highest probability for a document is the one that will be the final prediction for that document.
Formalization¶
Basic Definitions¶
Given an input document $x$, the MNB classification algorithm aims to calculate the probability of $x$ belonging to each class. For example, for our example dataset, we can ask: What are the two probabilities that the 3rd sentence "The goal for this term was to win the vote." belongs to class politics or sports? We can express this using conditional probabilities:
$P(\text{"politics"} | \text{"The goal for this term was to win the vote."})$
$P(\text{"sports"} | \text{"The goal for this term was to win the vote."})$
If we denote $Y=\{y_1, y_2\}$ as the set of classes — with $y_1\!=\!\text{politics}$ and $y_2\!=\!\text{sports}$ — and $x$ as our 3rd sentence in our example dataset $X$, we can rewrite the probabilities as:
$P(y_1 | x)$
$P(y_2 | x)$
Once we have both probabilities — or more in case of more than two classes — we use the class $y_i$ with the highest probability as the predicted class $y$ for input $x$.
$$\large y = \underset{y_i \in Y}{\text{argmax}}\ P(y_i|x) $$So how can we calculate all probabilities $P(y_i|x)$?
As the name suggests the Multinomial Naive Bayes classification algorithm utilizes Bayes' Theorem. Bayes' Theorem is a mathematical formula used to update probabilities based on new evidence. It helps answer the question: Given some prior knowledge, how should we update our beliefs when we receive new information? Using our notation as introduced above, the theorem is written as:
$$\large P(y_i|x) = \frac{P(x|y_i) P(y_i)}{P(x)} $$The MNB algorithm assumes a Bag-of-Words representation of document $x$ — each document is represented as the multiset (i.e., duplicates are preserved) of words contained in the document. We can therefore define $x = \{w_1, w_2, \dots, w_n \}$ as the multiset of words $w_1$ to $w_n$ appearing in document $x$. With this, we can rewrite the formula above as:
$$\large P(y_i|w_1, w_2, \dots, w_n) = \frac{P(w_1, w_2, \dots, w_n|y_i) P(y_i)}{P(w_1, w_2, \dots, w_n)} $$- $P(y_i|w_1, w_2, \dots, w_n)$ is the posterior, i.e., the probability of class $y_i$ given document $x = \{w_1, w_2, \dots, w_n \}$
- $P(w_1, w_2, \dots, w_n|y_i)$ is the likelihood, i.e., the probability of seeing $x$ given that it belongs to class $y_i$
- $P(w_1, w_2, \dots, w_n)$ is the marginal, i.e., the probability of seeing document $x$ regardless of its class
- $P(y_i)$ is the prior, i.e., the probability that a document belongs to class $y_i$ before seeing its content
Just by looking at this formula, it is not obvious what we have gained. On the contrary, we seemingly have replaced the calculation of one probability with the calculation of three probabilities. However, as we will see, we can now learn all required probabilities from the data. But before that, we can first simplify the formula and thus the calculation of the probabilities.
Ignoring the Marginals¶
Recall that the class we predict for an input document $x$ is the one with the highest probability $P(y_i|x)$. This implies that we are not really interested in the absolute values of $P(y_i|x)$; we only need to know which probability $P(y_i|x)$ is the largest among all classes. If you look at the formula for $P(y_i|w_1, w_2, \dots, w_n)$ above, you can see that the marginal $P(w_1, w_2, \dots, w_n)$ does not depend on the class $y_i$. In other words, the marginal has the same value for all calculations of $P(y_i|w_1, w_2, \dots, w_n)$, and we can treat it as a constant. We can therefore ignore the marginal since it does not affect the result of finding the class with the highest probability. Of course, ignoring the marginal no longer gives us the true probability $P(y_i|w_1, w_2, \dots, w_n)$, so we need to write
$$\large P(y_i|w_1, w_2, \dots, w_n) \propto P(w_1, w_2, \dots, w_n|y_i) P(y_i) $$for all classes $y_i$. This simplification does not require any assumption and does not lose any information needed to find the class with the highest posterior probability. While the prior $P(y_i)$ is very easy to calculate, as we will see in a bit, the likelihood is still very challenging.
The "Naive" Assumption¶
Let's first try to understand why calculating the likelihood probability $P(w_1, w_2, \dots, w_n|y_i)$ poses such a challenge. To do this, we can first apply the basic rule for conditional probabilities to get:
$$\large P(w_1, w_2, \dots, w_n|y_i) = \frac{P(w_1, w_2, \dots, w_n, y_i)}{P(y_i)} $$Again, $P(y_i)$ will be easy to calculate, but calculating the joint probability $P(w_1, w_2, \dots, w_n, y_i)$ causes problems in practice. To better see this, let's use the chain rule to express a joint probability in terms of only conditional probabilities:
$$\large P(w_1, w_2, \dots, w_n, y_i) = P(y_i)P(w_1|y_i)P(w_2|y_i,w_1)P(w_3|y_i, w_1, w_2)\dots P(w_n|y_i,w_1,\dots,w_{n-1}) $$One problem is computational complexity. Calculating joint probabilities by calculating all these conditional probabilities for all documents and classes quickly becomes computationally very expensive. This is particularly true if the number of features — the number of unique words/tokens (i.e., the vocabulary) in the context of document classification — increases. The second problem is data sparsity. In many real-world corpora, certain combinations of features (i.e., words/tokens) occur rarely or may not appear in the training data at all. This is particularly true for natural language text, which is highly variable and has a large vocabulary, resulting in many possible word combinations. Also, the sequence $w_1, w_2, \dots, w_n$ can be of arbitrary length, making the space of possible word sequences extremely large. This results in a high-dimensional problem where modeling the joint probability distribution is complex. During training, this means that many conditional probabilities will be very small. Since the likelihood requires the multiplication of $n$ conditional probabilities — with potentially very small values — this can cause problems with numerical stability due to the limited precision of machines and algorithms.
The MNB classification algorithm therefore makes the "naive" assumption that all features are conditionally independent given the class. Of course, this assumption does not strictly hold in practice when the features are words in a document. For example, when we see the word "York" in a sentence, it is very likely that we also see the word "New" in the same sentence. However, for document classification tasks, it turns out that assuming independence between words works well in practice, and the MNB algorithm often performs surprisingly well. For one, it still captures enough useful patterns in the data — that is, that individual words are often more associated with one class than others. Also, dependencies between words may not significantly harm the model's ability to discriminate between classes. So how does this assumption affect the calculation of the required probabilities?
$$\large P(y_i|w_1, w_2, \dots, w_n) \propto P(w_1, w_2, \dots, w_n|y_i) P(y_i) $$Assuming the independence between all words $w_1$, $w_2$, ..., $w_n$, we write this formula as
$$\large P(y_i|w_1, w_2, \dots, w_n) \propto P(w_1|y_i) P(w_2|y_i)\dots P(w_n|y_i) P(y_i) = P(y_i)\prod_{j=1}^{n}P(w_j|y_i) $$where we use $j$ to index the words of the document to avoid clashing with the class index $i$. In short, we no longer have to estimate any joint probabilities, which greatly reduces the issues concerning computational complexity and data sparsity.
Calculating Probabilities¶
The MNB classification algorithm uses Maximum Likelihood Estimation (MLE) to estimate the prior probabilities $P(y_i)$ and the conditional probabilities $P(w_i|y_i)$ from the training dataset. According to MLE, the estimate $\hat{P}(y_i)$ for the prior probability $P(y_i)$ is calculated as:
$$\large \hat{P}(y_i) = \frac{N_{y_i}}{N} = \frac{\text{number of documents of class }y_i}{\text{total number of documents}} $$For example, our example dataset contains 7 sentences, with 4 of them of class politics and 3 of them of class sports. With this, we can directly calculate our two estimates for both priors as:
$$\large \hat{P}(politics) = \frac{4}{7} \approx 57\% \quad \text{and}\quad \hat{P}(sports) = \frac{3}{7} \approx 43\% $$This means that if we had to classify an unseen sentence without actually knowing its content, we would predict its class to be politics, since $P(politics) > P(sports)$. Regarding the conditional probabilities, according to MLE, their estimates $\hat{P}(w_i|y_i)$ can be calculated using the following formula:
$$\large \hat{P}(w_i|y_i) = \frac{count(w_i,y_i)}{\sum_{w\in V}count(w, y_i)} $$where $count(w,y)$ is the number of occurrences of a word $w$ in documents of class $y$. For example, in our example dataset, the word "win" appears once in the 5th sentence and twice in the 6th sentence, both of class sports. Thus, $count(win, sports)=3$. The denominator of the estimate for the likelihood ensures that the result is a true probability. In short, training a MNB classifier for document classification boils down to counting the occurrences of each word contained in the training corpus. Performance considerations aside, this makes MNB a very simple classification algorithm that is easy to understand and implement.
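As a tiny sketch, we can confirm $count(win, sports)=3$ by counting over the (lowercased, punctuation-stripped) sports sentences from the table above:
# Count the occurrences of "win" across all sentences of class sports
sports_sentences = [
    "The goal of any team player is the win.",
    "A win for the team is a win for each player.",
    "Players vote other players for another term.",
]
# Lowercase, strip the final period, and count the word "win"
count_win_sports = sum(s.lower().replace(".", "").split().count("win") for s in sports_sentences)
print(count_win_sports)  # 3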
Zero Probabilities & Smoothing¶
Despite the naive assumption, in practice, we can still run into the problem of dealing with zero probabilities. Let's have another look at our formula for calculating the posterior probabilities as the product of the prior and the conditional probabilities representing the likelihoods:
$$\large P(y_i|w_1, w_2, \dots, w_n) \propto P(y_i)\prod_{i=1}^{n}P(w_i|y_i) $$In short, the posterior is calculated as the product of many probabilities. Let's assume we want to predict the class of the following example sentence "The mayor elected". This means we need to calculate $P(\text{"politics"}|\text{"the mayor elected"})$ and $P(\text{"sports"}|\text{"the mayor elected"})$. If we just focus on the latter we get:
$$\large P(\text{"sports"}|\text{"the mayor elected"}) \propto P(\text{"sports"})P(\text{"The"}|\text{"sports"})P(\text{"mayor"}|\text{"sports"})P(\text{"elected"}|\text{"sports"}) $$Now, have a look at our example dataset: none of the sentences of class sports contains the word "mayor". As such, $P(\text{"mayor"}|\text{"sports"}) = 0$. This means that the complete product will also be $0$, no matter the prior and the other conditional probabilities. Thus, for any document $x$ that contains the word "mayor", the posterior probability $P(\text{"sports"}|x)$ will always be $0$. Note that this implicitly assumes that no document of class sports will ever contain the word "mayor". In practice, this is a very unrealistic assumption, even for much larger datasets, due to data sparsity.
To avoid such zero probabilities, we need to ensure that $P(w_i|y_i) > 0$ for any word $w_i$ in our vocabulary and any class $y_i$. Smoothing accomplishes this by adding a positive non-zero value $k$ to all word counts $count(w,y)$:
$$\large \hat{P}(w_i|y_i) = \frac{count(w_i,y_i) + k}{\sum_{w\in V}count(w, y_i) + k|V|} $$where $|V|$ is the size of the vocabulary. Note that the added term $k|V|$ in the denominator ensures that the result remains a true probability. A straightforward choice is $k=1$, which is also called Laplace Smoothing. Intuitively, $k=1$ means that we act as if each word in our vocabulary had been seen one additional time in the documents of each class. However, $k$ does not have to be an integer, and values in the range $(0, 1]$ are common in practice. Any positive value of $k$ ensures that we do not get zero probabilities for any $\hat{P}(w_i|y_i)$.
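To see the effect of add-$k$ smoothing on such a zero count, here is a minimal sketch using values we will derive in the worked example below ($count(mayor, sports)=0$, 12 words of class sports in total, and $|V|=8$):
# Effect of add-k smoothing on a zero count (values taken from the worked example below)
count_mayor_sports = 0    # "mayor" never occurs in a sports sentence
total_sports_words = 12   # total number of words in all sports sentences
vocab_size = 8            # |V|
for k in [0, 0.5, 1]:
    p = (count_mayor_sports + k) / (total_sports_words + k * vocab_size)
    print(f"k={k}: P(mayor|sports) = {p:.4f}")
# k=0 yields 0.0, while k=1 yields 1/20 = 0.05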
In principle, we can also get zero probabilities for the priors $P(y_i)$. This can of course only happen if the dataset does not contain any document of a certain class. For example, if we wanted to predict the three classes politics, sports, and lifestyle, our example dataset contains samples only for the first two classes but none for lifestyle. This means that $P(\text{"lifestyle"}) = 0$. Again, we could use smoothing to prevent zero probabilities for all priors:
$$\large \hat{P}(y_i) = \frac{N_{y_i} + k}{N + k|Y|} $$where $|Y|$ is the number of classes. However, this is usually not done in practice since training a classifier using a dataset without any sample for one or more classes is very problematic and should not really be attempted in the first place. Thus, most existing implementations of the MNB classification algorithm assume that there are samples of each class and therefore only non-zero prior probabilities.
Log Probabilities¶
While smoothing ensures that none of the conditional probabilities $P(w_i|y_i)$ are $0$, those probabilities are typically still very small for real-world datasets. Particularly in the case of long documents, we need to multiply many very small probability values. On normal computers with their naturally limited precision, this can lead to arithmetic underflow, negatively affecting the numerical stability of the calculations. It is therefore common to calculate log probabilities instead of the raw probabilities. Since the logarithm is a monotonically increasing function, it does not affect the order of the final posterior probabilities. As such, we can write:
$$\large \log{P(y_i|w_1, w_2, \dots, w_n)} \propto \log{\left[ P(y_i)\prod_{j=1}^{n}P(w_j|y_i)\right]} = \log{ P(y_i)} + \sum_{j=1}^{n}\log{P(w_j|y_i)} $$where we applied the logarithm rule that $\log{(ab)} = \log{a} + \log{b}$. In simple terms, this transforms the product of probabilities into a sum of log probabilities, which provides a numerically stable calculation.
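The following small sketch illustrates the underflow problem and how log probabilities avoid it; the probability value 1e-5 and the document length of 100 words are arbitrary illustration choices:
# 100 word likelihoods of 1e-5 each; their true product is 1e-500
probs = np.full(100, 1e-5)
print(np.prod(probs))         # 0.0 -- underflows, since 1e-500 is below float64 precision
print(np.sum(np.log(probs)))  # approx. -1151.29 -- easily representable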
Worked Example¶
Let's go through a complete example with all the calculations and train a MNB classifier for our simple dataset for a binary classification task.
Data Preprocessing¶
The MNB classification algorithm only considers which words occur in a document and how often (but not their order) and therefore operates on the Bag-of-Words (BoW) representation of documents. With this focus on word occurrences, we often want to ignore the variation of words, including (a) whether verbs are in the present, past, or progressive form, (b) whether nouns are in singular or plural form, (c) whether adjectives are in their base, comparative, or superlative form, or (d) whether words are capitalized or not. Thus, it is very common to perform case-folding, and stemming or lemmatization as preprocessing steps for training a MNB classifier for document classification.
Furthermore, the class or topic of a document is arguably mainly expressed by nouns, verbs, and adjectives (and maybe adverbs). In contrast, function words such as determiners, prepositions, and pronouns, generally do not contribute to discriminating documents with respect to the class labels, since function words frequently appear in documents of all classes. It is therefore also common to perform stopword removal. This typically greatly reduces the total number of words in a document which reduces the number of required calculations.
The table below shows our initial example dataset, but now including the sentences (in BoW representation) after case-folding to lowercase, lemmatization, stopword removal, and punctuation mark removal.
| Sentence | Sentences (processed) | Class |
|---|---|---|
| The mayor was elected for this term and next term. | mayor elect term term | politics |
| A mayor's goal for the next term is to win. | mayor goal term win | politics |
| The goal for this term was to win the vote. | goal term win vote | politics |
| This term's goals are next term's goals. | term goal term goal | politics |
| The goal of any team player is the win. | goal team player win | sports |
| A win for the team is a win for each player. | win team win player | sports |
| Players vote other players for another term. | player vote player term | sports |
These processed sentences form the input of training a MNB classifier. From this dataset we can derive the vocabulary as the set of unique words as:
$$\large V = \{ mayor, elect, term, goal, win, vote, team, player \} $$giving us a vocabulary size of $|V| = 8$. In the code cell below, we define the training dataset X_train containing our seven preprocessed sentences, as well as the vector y_train of class labels. For the naming of both X_train and y_train, we adhere to common conventions, where $X$ generally denotes the samples with their features, and $y$ denotes the class labels. We also generate the vocabulary V from the dataset; its size $|V|$, which we will need later, can easily be obtained with len(V).
# Training data
X_train = np.asarray([
"mayor elect term term",
"mayor goal term win",
"goal term win vote",
"term goal term goal",
"goal team player win",
"win team win player",
"player vote player term"
])
# Labels for training data
y_train = np.asarray(["politics", "politics", "politics", "politics", "sports", "sports", "sports"])
# Extract vocabulary
V = set([word for doc in X_train for word in doc.split()])
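As a quick check, we can print the vocabulary and its size; for our seven preprocessed sentences, this should give the 8 words listed above.
# Inspect the extracted vocabulary and its size |V|
print(sorted(V))  # ['elect', 'goal', 'mayor', 'player', 'team', 'term', 'vote', 'win']
print(len(V))     # 8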
Calculating Priors¶
We already calculated the prior probabilities above as
$$\large \hat{P}(politics) = \frac{4}{7} \approx 57\% \quad \text{and}\quad \hat{P}(sports) = \frac{3}{7} \approx 43\% $$where $\hat{P}(politics) = \frac{4}{7}$ indicates that 4 of the 7 sentences are of class politics. The code cell below shows how we can calculate these values for our example dataset. The unique() function of NumPy finds the unique elements of an array. It returns the sorted unique values from the input array, optionally along with additional information such as the number of occurrences of each unique element when setting the optional parameter return_counts=True. In short, we can use this function to easily obtain $N_{politics}$ and $N_{sports}$, which are needed to calculate the prior probabilities.
def calculate_priors(y_train):
classes, counts = np.unique(y_train, return_counts=True)
return dict(zip(classes, counts/len(y_train)))
priors = calculate_priors(y_train)
print(priors)
{'politics': 0.5714285714285714, 'sports': 0.42857142857142855}
In short, we can summarize our result for the calculation of the prior probabilities as follows:
| $\large y_i$ | $\large P(y_i)$ |
|---|---|
| politics | $\large \frac{4}{7}$ |
| sports | $\large \frac{3}{7}$ |
Calculating Likelihoods¶
Recall that we calculate the likelihood probability for a word $w_i$ and a given class $y_i$ as:
$$\large \hat{P}(w_i|y_i) = \frac{count(w_i,y_i)}{\sum_{w\in V}count(w, y_i)} $$For an implementation, this means that we have to go through our training corpus and count (a) the number of occurrences of each word $w_i$ in all documents of class $y_i$, and (b) the total number of words in all documents of class $y_i$. The method calculate_counts() in the code cell below accomplishes this by iterating, for each class, over all documents (here: sentences) of that class and over all words in a document, counting the number of occurrences of each word. Notice that the method uses a special key __ALL__ for the count of all words in documents of the same class. This allows us to store all required counts in the same dictionary.
def calculate_counts(X_train, y_train):
# Get set of unique class labels
classes = np.unique(y_train)
# Initialize dictionary holding counts
counts = { label:defaultdict(int) for label in classes }
# Iterate over each class
for c in classes:
# Get the indices of all documents of that class
class_indices = np.where(y_train == c)[0]
# Iterate over all documents and words
for sentence in X_train[class_indices]:
for word in sentence.split():
counts[c]["__ALL__"] += 1
counts[c][word] += 1
return counts
Let's run this method to calculate all the counts and plot the result.
counts = calculate_counts(X_train, y_train)
print(json.dumps(counts, indent=2))
{
"politics": {
"__ALL__": 16,
"mayor": 2,
"elect": 1,
"term": 6,
"goal": 4,
"win": 2,
"vote": 1
},
"sports": {
"__ALL__": 12,
"goal": 1,
"team": 2,
"player": 4,
"win": 3,
"vote": 1,
"term": 1
}
}
Based on the output above, we can see that all documents of class politics contain a total of 16 words, while all documents of class sports contain a total of 12 words. We can also see that the word "win" appears twice in the documents of class politics and three times in documents of class sports. Notice that this dictionary only contains words with counts larger than $0$. For example, since no sports document contains the word "mayor", this word does not appear in the dictionary for class sports. Of course, this implicitly means that those counts are $0$. For easier visualization, we can also show the individual word counts using a table as shown below, now also including the zero counts.
| $\large w_i$ | $\large count(w_i, politics)$ | $\large count(w_i, sports)$ |
|---|---|---|
| elect | 1 | 0 |
| goal | 4 | 1 |
| mayor | 2 | 0 |
| player | 0 | 4 |
| team | 0 | 2 |
| term | 6 | 1 |
| vote | 1 | 1 |
| win | 2 | 3 |
In principle, we could already use these counts to make predictions since the conditional probabilities for the likelihoods are just ratios of those counts. However, let's explicitly calculate the likelihoods so we can see the actual values. The method calculate_likelihoods() below calculates all conditional probabilities based on the available counts. The method iterates over each class and first extracts the total number of words in all documents of that class from the counts dictionary. For each class, it then iterates over all words in the vocabulary to get the respective word counts. It finally calculates the likelihood by dividing the word count by the total word count — including the additive smoothing using $k$ — as defined by the formula above.
def calculate_likelihoods(counts, vocab, k=0):
# Initialize dictionary
likelihoods = { label:{} for label in counts.keys() }
# Calculate likelihoods based on formula
for label in counts.keys():
# Get number of words in all documents of current class
total_word_count = counts[label]["__ALL__"]
for word in vocab:
# Get the count for the current word
word_count = counts[label][word]
# Calculate likelihood for current word and store in dictionary
likelihoods[label][word] = (word_count + k) / (total_word_count + k*len(vocab))
# Return the dictionary of likelihoods
return likelihoods
As a side note about this implementation: since the counts dictionary contains a defaultdict for each class label, the line word_count = counts[label][word] will return $0$ if no count for word exists for this class — instead of throwing an error as a normal Python dictionary would. This is merely to simplify the required code.
Let's run the method calculate_likelihoods() using the calculated counts and $k=1$ to see the resulting likelihoods. Of course, you can change the value of smoothing parameter $k$ to see its effects on the values. However, we assume $k=1$ for later comparisons.
likelihoods = calculate_likelihoods(counts, V, k=1)
print(json.dumps(likelihoods, indent=2))
{
"politics": {
"vote": 0.08333333333333333,
"mayor": 0.125,
"term": 0.2916666666666667,
"goal": 0.20833333333333334,
"elect": 0.08333333333333333,
"win": 0.125,
"team": 0.041666666666666664,
"player": 0.041666666666666664
},
"sports": {
"vote": 0.1,
"mayor": 0.05,
"term": 0.1,
"goal": 0.1,
"elect": 0.05,
"win": 0.2,
"team": 0.15,
"player": 0.25
}
}
To better appreciate what is going on, the table below shows all the individual calculations and results. For example, the likelihood $P(win|politics)$ in the last row of the table with
$$\large P(win|politics) = \frac{2+1}{16+8} = \frac{3}{24} = 0.125 $$shows that the word "win" appears 2x in all documents of class politics, and that there are a total of 16 words in all documents of class politics. The $1$ represents our smoothing constant $k=1$, and the $8$ represents the constant $k|V| = 1\cdot 8 = 8$, which normalizes the probabilities with respect to the smoothing value $k$.
| $\large w_i$ | $\large P(w_i \mid politics)$ | $\large P(w_i \mid sports)$ |
|---|---|---|
| elect | $\large \frac{1+1}{16+8} = \frac{2}{24}$ | $\large \frac{0+1}{12+8} = \frac{1}{20}$ |
| goal | $\large \frac{4+1}{16+8} = \frac{5}{24}$ | $\large \frac{1+1}{12+8} = \frac{2}{20}$ |
| mayor | $\large \frac{2+1}{16+8} = \frac{3}{24}$ | $\large \frac{0+1}{12+8} = \frac{1}{20}$ |
| player | $\large \frac{0+1}{16+8} = \frac{1}{24}$ | $\large \frac{4+1}{12+8} = \frac{5}{20}$ |
| team | $\large \frac{0+1}{16+8} = \frac{1}{24}$ | $\large \frac{2+1}{12+8} = \frac{3}{20}$ |
| term | $\large \frac{6+1}{16+8} = \frac{7}{24}$ | $\large \frac{1+1}{12+8} = \frac{2}{20}$ |
| vote | $\large \frac{1+1}{16+8} = \frac{2}{24}$ | $\large \frac{1+1}{12+8} = \frac{2}{20}$ |
| win | $\large \frac{2+1}{16+8} = \frac{3}{24}$ | $\large \frac{3+1}{12+8} = \frac{4}{20}$ |
Having calculated all priors and likelihoods, our MNB classifier is done training.
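As a quick sanity check, the smoothed likelihoods of each class should sum to $1$, since they form a proper probability distribution over the vocabulary:
# Sanity check: for each class, the likelihoods over the vocabulary sum to 1
for label in likelihoods:
    print(label, sum(likelihoods[label].values()))  # 1.0 for both classes (up to floating-point error)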
Making Predictions¶
Now that we have a trained MNB classifier, we naturally want to use it to predict the classes of new unseen documents. To show an example, let's consider the following sentence for which we want to predict the class. The table below shows the original sentence as well as the sentence after applying the same preprocessing steps as for the training data (case-folding, lemmatization, stopword & punctuation mark removal).
| Sentence | Sentences (processed) | Class |
|---|---|---|
| The mayor and his team have another term next year. | mayor team term year | ??? |
Using our learned priors and likelihoods, we can now calculate both $P(politics|mayor, team, term)$ and $P(sports|mayor, team, term)$ as shown below to see which class has the highest posterior probability (at least proportionally).
$$ \begin{align} \large P(politics|mayor, team, term)\ &\large\propto P(politics)P(mayor|politics)P(team|politics)P(term|politics)\\[1em] \large P(sports|mayor, team, term)\ &\large\propto P(sports)P(mayor|sports)P(team|sports)P(term|sports) \end{align} $$Important: Notice that the word "year" does not appear in any calculation, since "year" did not appear at all in our training corpus and was therefore also not in our vocabulary. Any word that was not in the vocabulary of the training data will be completely ignored by the MNB classifier when making predictions.
To get the final posterior probabilities, let's just plug in the values for the priors and likelihoods:
$$ \begin{align} \large P(politics|mayor, team, term)\ &\large\propto \frac{4}{7}\cdot \frac{3}{24}\cdot \frac{1}{24}\cdot \frac{7}{24} = 0.000868...\\[1em] \large P(sports|mayor, team, term)\ &\large\propto \frac{3}{7}\cdot \frac{1}{20}\cdot \frac{3}{20}\cdot \frac{2}{20} = 0.000321... \end{align} $$Since $P(politics|mayor, team, term)$ has the highest probability among all classes, the predicted class for the document is politics.
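Before implementing a general predict() method, we can quickly verify these two numbers with a few lines of Python:
# Verify the two (unnormalized) posterior values calculated by hand
p_politics = (4/7) * (3/24) * (1/24) * (7/24)
p_sports   = (3/7) * (1/20) * (3/20) * (2/20)
print(p_politics)  # approx. 0.000868
print(p_sports)    # approx. 0.000321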
We can implement the calculations required to predict the class for a given document as another method. The method predict() in the code cell below takes the document as a string as well as the learned priors and likelihoods. For each class label, it then calculates the posterior probability as shown above. For this, it utilizes the representation of the priors and likelihoods as dictionaries. Notice how the check if word in likelihoods[label] ensures that we only consider the likelihoods of words that are in our training vocabulary.
def predict(doc, priors, likelihoods, verbose=False):
# Initialize dictionary holding the posterior probabilities for all classes
posteriors = { label:0 for label in priors.keys() }
for label in priors.keys():
# Find all likelihood probabilties for the given class label and words in the document
likelihood_probs = np.asarray([ likelihoods[label][word] for word in doc.split() if word in likelihoods[label] ])
# Calculate posterior by multiplying prior and all likelihood probabilities
posteriors[label] = priors[label] * np.prod(likelihood_probs)
# Show posterior probabilities if specified
if verbose == True: print(f"Posteriors: {posteriors}")
# Return the label (key of dictionary) with the maximum posterior (value of dictionary)
return max(posteriors, key=posteriors.get)
We can now use the predict() method to predict the class for a document. By default, the code cell below predicts the class for our (processed) example sentence "mayor team term" (with the out-of-vocabulary word "year" already dropped), but you can try out other sentences to see how they affect the prediction results. For the given example sentence and calling the method with verbose=True, you should see the same posterior probabilities as manually calculated above.
doc = "mayor team term"
#doc = "player team term"
#doc = "player team term elect"
#doc = "player team term term"
#doc = "player team term term term"
#doc = "year match group victory"
predicted_class = predict(doc, priors, likelihoods, verbose=True)
print(f"The predicted class for the sentence '{doc}' is: {predicted_class}")
Posteriors: {'politics': 0.0008680555555555555, 'sports': 0.0003214285714285714}
The predicted class for the sentence 'mayor team term' is: politics
Notice that the last example sentence "year match group victory" in the code cell above does not contain any word that was in our vocabulary of the training dataset. This means that we have no likelihood values for any of the words in this sentence, and the prediction of the class relies solely on the prior probabilities. Thus, if you run the code cell with this example sentence, you will see that the posterior probabilities are in fact the prior probabilities. And since politics is the dominating class in our example dataset, we would predict politics for documents containing only unknown words.
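Here is a minimal check of this behavior using our predict() method and the priors and likelihoods computed above:
# A document containing only out-of-vocabulary words falls back to the priors
oov_prediction = predict("year match group victory", priors, likelihoods, verbose=True)
print(oov_prediction)  # politics -- the class with the larger prior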
Practical Application¶
The code we have used so far for training a MNB classifier and making predictions focused on understanding the underlying calculations. The code also assumed text documents as input. In practice, the MNB classification algorithm is not limited to classifying text documents but is applicable to any kind of multinomial data where a data sample (e.g., a text document) is described by the presence or absence of nominal features (e.g., words) and their number of occurrences. Furthermore, for practical applications, optimizing the runtime performance is an important consideration.
The MultinomialNB class in the scikit-learn library implements the Multinomial Naive Bayes algorithm. It is designed for data that can be represented as frequency counts, such as word occurrences in a document. The algorithm assumes that the features (e.g., words) are conditionally independent given the class and follow a multinomial distribution, which models the probability of observing specific frequencies of events. Key hyperparameters include alpha, which controls the level of smoothing (what we called $k$), and options controlling the class priors. The fit() method trains the model using labeled data, while methods like predict() and predict_proba() allow for making predictions and obtaining probability estimates for each class.
Let's see how we can use the MultinomialNB class to train a classifier over our example corpus.
Creating a Feature Matrix¶
The MultinomialNB class does not directly take text documents as input. The expected input format consists of two main components: the feature matrix X and the target vector y. The latter should be a 1-dimensional array-like structure containing the class labels for each sample in $X$. The class labels can be integers or strings, representing distinct categories for classification. We can therefore directly use y_train as it meets all the requirements.
The feature matrix X should be a 2-dimensional array-like structure, where rows represent samples and columns represent features. The values in X must be non-negative, as the algorithm is designed to work with counts or frequencies, such as word occurrences in text classification tasks. Each row in X corresponds to a single sample, and each column represents a specific feature (e.g., a word in a vocabulary). The values in the matrix are typically integer counts or term frequencies, but they can also be non-negative real numbers if the features have been normalized (e.g., using TF-IDF). We therefore first need to convert our example dataset containing the seven sentences into this matrix representation.
The CountVectorizer class in scikit-learn is a feature extraction tool used to convert a collection of text documents into a matrix of token counts. It is commonly used in natural language processing (NLP) tasks to prepare text data for machine learning models by representing text as numerical data. Each row of the resulting matrix corresponds to a document, while each column represents a token (usually a word or a sequence of words). The value at a specific cell in the matrix indicates the frequency of the token in the document. The code cell below uses the CountVectorizer class to calculate all the word counts for our example dataset. To visualize the matrix, we convert it into a Pandas DataFrame. Notice that we transpose the matrix to get the more common Term-Document Matrix, where the rows represent the terms/words and the columns represent the documents.
count_vectorizer = CountVectorizer()
X_train_vectorized = count_vectorizer.fit_transform(X_train)
# Convert to pandas dataframe -- just for a nice visualization
df_tdm = pd.DataFrame(X_train_vectorized.A.T, columns=[ "d{}".format(d+1) for d in range(len(X_train)) ])
df_tdm = df_tdm.set_index(pd.Index(count_vectorizer.get_feature_names_out()))
df_tdm
| d1 | d2 | d3 | d4 | d5 | d6 | d7 | |
|---|---|---|---|---|---|---|---|
| elect | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| goal | 0 | 1 | 1 | 2 | 1 | 0 | 0 |
| mayor | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| player | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
| team | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| term | 2 | 1 | 1 | 2 | 0 | 0 | 1 |
| vote | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| win | 0 | 1 | 1 | 0 | 1 | 2 | 0 |
For example, the "$1$" in the top-left corner of the Term-Document Matrix above indicates that the word "elect" appears once in document $d1$. Let's visualize the Term-Document Matrix again, but now also highlighting which documents belong to politics (light blue) and which belong to sports (light orange).
| d1 | d2 | d3 | d4 | d5 | d6 | d7 | |
|---|---|---|---|---|---|---|---|
| elect | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| goal | 0 | 1 | 1 | 2 | 1 | 0 | 0 |
| mayor | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| player | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
| team | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| term | 2 | 1 | 1 | 2 | 0 | 0 | 1 |
| vote | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| win | 0 | 1 | 1 | 0 | 1 | 2 | 0 |
Appreciate how this Term-Document Matrix contains all the values needed to calculate our $count(w,y)$ values. For example, we can calculate $count(win, politics)$ by summing up all the counts in the row for the word "win" over the documents of class politics; this gives us $count(win, politics) = 2$. Similarly, we can calculate $count(win, sports)$ by summing up all the counts in the row for the word "win" over the documents of class sports; this gives us $count(win, sports) = 3$. The Term-Document Matrix below highlights the respective row and counts.
| d1 | d2 | d3 | d4 | d5 | d6 | d7 | |
|---|---|---|---|---|---|---|---|
| elect | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| goal | 0 | 1 | 1 | 2 | 1 | 0 | 0 |
| mayor | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| player | 0 | 0 | 0 | 0 | 1 | 1 | 2 |
| team | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| term | 2 | 1 | 1 | 2 | 0 | 0 | 1 |
| vote | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| win | 0 | 1 | 1 | 0 | 1 | 2 | 0 |
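As a quick cross-check, we can reproduce these class-conditional word counts directly from X_train_vectorized by summing, for each class, the rows of the documents belonging to that class:
# Reproduce count(w, y) by summing the term counts of all documents of each class
X_counts = X_train_vectorized.toarray()
terms = count_vectorizer.get_feature_names_out()
for label in np.unique(y_train):
    class_counts = X_counts[y_train == label].sum(axis=0)
    print(label, dict(zip(terms, class_counts.tolist())))
# politics: e.g., term=6, goal=4, win=2; sports: e.g., player=4, win=3, term=1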
Training the Classifier¶
With X_train_vectorized and y_train, we have the two inputs in the proper format to train a MNB classifier using the MultinomialNB class. As with most model implementations provided by scikit-learn, the actual training requires only a single line of code, as shown in the code cell below.
mnb_classifier = MultinomialNB().fit(X_train_vectorized, y_train)
Now that the model is trained, let's see if it matches our result. Note that by default, MultinomialNB uses Laplace Smoothing — $k=1$ or alpha=1 — so we should see the same prior and likelihood probabilities as before when training the classifier "manually". The classes_ attribute of the MultinomialNB class is a list or array that contains the unique class labels identified during training. The attribute is automatically set when the model is trained using the fit() method. The class_log_prior_ attribute, on the other hand, stores the log of the prior probabilities for each class.
We can therefore use both attributes to print the two prior probabilities for our two classes politics and sports. Notice that we need to use the np.exp() method to convert the log probabilities back to "normal" probabilities to compare them with our earlier results.
mnb_classifier_prior = dict(zip(mnb_classifier.classes_, np.exp(mnb_classifier.class_log_prior_)))
print(mnb_classifier_prior)
{'politics': 0.5714285714285714, 'sports': 0.42857142857142866}
Unsurprisingly, the prior probabilities from the MultinomialNB match the ones we got from our own calculations. We can now also do the same check for the likelihood probabilities. The feature_log_prob_ attribute of the MultinomialNB class contains the logarithm of the conditional likelihood probabilities of each feature given each class, i.e., $\log{P(x_i|y_i)}$, where $x_i$ is a feature (e.g., a word, which we previously denoted as $w_i$). This attribute is stored as a 2D array where each row corresponds to a class and each column corresponds to a feature. In the code cell below, we use the feature_log_prob_ attribute to extract all probabilities and connect them to the feature names, i.e., the words, using the vectorizer. The specific implementation ensures that the output format of all probabilities matches the format we have used above for easy comparison.
mnb_classifier_likelihoods = { label:{} for label in mnb_classifier.classes_ }
for i, label in enumerate(mnb_classifier.classes_):
    # Map each feature name (word) to the exponentiated log probability for this class
    mnb_classifier_likelihoods[label] = dict(zip(count_vectorizer.get_feature_names_out(),
                                                 np.exp(mnb_classifier.feature_log_prob_[i])))
print(json.dumps(mnb_classifier_likelihoods, indent=2))
{
"politics": {
"elect": 0.08333333333333333,
"goal": 0.2083333333333333,
"mayor": 0.12499999999999997,
"player": 0.041666666666666664,
"team": 0.041666666666666664,
"term": 0.29166666666666663,
"vote": 0.08333333333333333,
"win": 0.12499999999999997
},
"sports": {
"elect": 0.05000000000000001,
"goal": 0.10000000000000002,
"mayor": 0.05000000000000001,
"player": 0.25,
"team": 0.15000000000000002,
"term": 0.10000000000000002,
"vote": 0.10000000000000002,
"win": 0.2
}
}
Again, as expected the probabilities are the same as the ones from our own calculations.
Making Predictions¶
The predict() method of the MultinomialNB class predicts the class labels for a given set of input samples. After the model has been trained using the fit() method, the predict() method takes as input a feature matrix X, where each row represents a sample and each column represents a feature, just like for training. It returns an array of predicted class labels, one for each sample, based on the model's learned parameters (e.g., class_log_prior_ and feature_log_prob_). This means that we first have to transform any input document into the data matrix X using the same trained CountVectorizer instance.
The code cell below contains the same example documents/sentences as before so you can check whether the predicted class labels match our expectations.
doc = "mayor team term"
#doc = "player team term"
#doc = "player team term elect"
#doc = "player team term term"
#doc = "player team term term"
#doc = "year match group victory"
# Convert sentence into data matrix (containing just this one sample)
doc_vectorized = count_vectorizer.transform([doc])
print(f"The predicted class for the sentence '{doc}' is: {mnb_classifier.predict(doc_vectorized)[0]}")
The predicted class for the sentence 'mayor team term' is: politics
Now we have seen how we can train and use a MNB classifier in practice using a well-established implementation of the algorithm from a popular library. Not only do these libraries contain tried and tested code, they are typically also optimized to maximize runtime performance.
Discussion & Considerations¶
The Multinomial Naive Bayes (MNB) classifier is a simple and efficient classification model, which can be very effective for handling text data for certain tasks. The MNB algorithm is computationally efficient, as it requires minimal resources to train and classify documents compared to more complex models. Its simplicity also ensures quick implementation and interpretability, which is useful for understanding how the model makes predictions. Another advantage is its robustness when working with sparse data, a common characteristic of text datasets. Since most documents contain only a small subset of the vocabulary, the classifier is designed to handle the sparsity of feature vectors effectively. While it may not capture complex relationships between words, its performance is often competitive on simpler tasks, especially when combined with techniques like n-grams or proper text preprocessing. In the following, we briefly discuss its limitations and potential mitigation strategies.
Independence Assumption¶
The independence assumption in the multinomial Naive Bayes (MNB) classifier for document classification states that the features (e.g., words in a document) are conditionally independent given the class label. In simpler terms, it assumes that the occurrence of one word in a document does not depend on the occurrence of any other word, once the document's class is known. For example, if a document is labeled as sports, the model assumes that knowing that the word "football" appears in the document provides no additional information about the likelihood of seeing the word "team" beyond the fact that the document is already classified as sports. Although this assumption does not, strictly speaking, hold, it is often acceptable in practice for multiple reasons:
High dimensionality of text data: Text data is inherently high-dimensional, meaning documents typically contain many unique words. In such cases, exact dependencies between words are hard to model without overfitting. The independence assumption simplifies this complexity, making the model computationally efficient while still performing reasonably well.
Class conditional word distribution: In document classification, the most important factor is the relative frequency of words in different classes. The multinomial Naive Bayes model captures this effectively. Even if the independence assumption is not strictly true, the model often succeeds because it focuses on capturing broad patterns rather than exact word relationships.
Robustness in real-world applications: Empirical studies show that the multinomial Naive Bayes classifier performs surprisingly well for tasks like spam filtering and sentiment analysis, despite its naive assumption. This is because in many text classification problems, the true dependencies between words do not critically affect classification accuracy.
This independence assumption has several advantages. Firstly, it greatly simplifies the calculations, as the model does not need to estimate the joint probabilities of all word combinations. This makes it fast and scalable to large datasets. Secondly, the simplicity of the model makes it easy to understand, implement, and train, even for non-experts. And lastly, in text classification, where most documents only contain a small subset of all possible words, the MNB model performs well because it focuses on capturing the most informative features.
However, the independence assumption may also result in an oversimplification of word dependencies. Words in natural language often have dependencies (e.g., "New" and "York" frequently occur together). Ignoring these relationships can result in a loss of information, potentially reducing accuracy. For tasks where the interaction between words is crucial (e.g., sarcasm detection, parsing ambiguous sentences, or capturing negation for sentiment analysis), the independence assumption can significantly limit the model's ability to understand context. Also, since MNB does not model word dependencies, the choice of features (e.g., unigrams, bigrams — see below) becomes critical. Poor feature selection can lead to degraded performance.
In short, while the independence assumption in multinomial Naive Bayes is a simplification of reality, it strikes a balance between computational efficiency and classification accuracy, especially in high-dimensional and sparse text datasets. However, for more complex tasks requiring a nuanced understanding of language, advanced models like logistic regression or neural networks might be more suitable. Despite its limitations, the MNB classifier remains a popular choice for text classification tasks due to its simplicity, speed, and robustness in many practical scenarios.
Beyond Unigrams¶
So far, in all examples for using a MNB classifier to train a document classification model, we considered features to be individual words (i.e., unigrams). A common use case where this might negatively affect the model's performance is sentiment analysis. Consider the two simple sentences:
- "The movie was not boring because the cast was great." (positive sentiment)
- "The movie was boring because the cast was not great." (negative sentiment)
When using only unigrams, both sentences would yield the exact same posterior probabilities — since both sentences contain exactly the same words — and would therefore yield the same prediction. We would therefore like to somehow capture the connections within phrases such as "not boring" and "not great". Fortunately, the MNB classification algorithm is completely agnostic about the "nature" of the nominal features. This means that, instead of limiting ourselves to words/unigrams, we can use n-grams as features, which only affects the preprocessing of the dataset and not the classifier itself, as the short sketch below illustrates.
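Here is a small sketch of this idea: using CountVectorizer with ngram_range=(1, 2) on the two sentiment sentences above, the bigrams "not boring" and "not great" become separate features that distinguish the two otherwise identical bags of words:
# Unigrams + bigrams make the two sentiment sentences distinguishable
sentiment_sentences = [
    "The movie was not boring because the cast was great.",
    "The movie was boring because the cast was not great.",
]
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_sentiment = bigram_vectorizer.fit_transform(sentiment_sentences)
features = set(bigram_vectorizer.get_feature_names_out())
print("not boring" in features)  # True -- occurs only in the first sentence
print("not great" in features)   # True -- occurs only in the second sentence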
To illustrate this, let's look again at our example dataset containing seven sentences. However, notice that no stopword removal has been performed. Stopwords are typically not removed when extracting n-grams of size 2 or larger because they often play an important role in preserving the context and meaning of phrases. In n-grams, especially bigrams or trigrams, stopwords (e.g., "is", "in", "of") can form part of meaningful expressions that provide critical information for text analysis. For example, in phrases like "out of stock" or "state of the art", removing stop words would break these n-grams and lose their semantic value. These phrases are often highly predictive features in tasks like sentiment analysis or topic classification. Additionally, when working with n-grams, the inclusion of stopwords helps maintain the natural structure of the text, allowing the model to capture more realistic language patterns. Removing stop words could lead to incomplete or nonsensical n-grams that may be less useful as features. However, this decision depends on the specific task and dataset—if stopwords are highly frequent and do not contribute meaningful distinctions between classes, their removal might still be considered to reduce noise and dimensionality.
# Training data
sentences = np.asarray([
"the mayor was elected for this term and next term",
"a mayor's goal for the next term is to win",
"the goal for this term was to win the vote",
"this term's goals are next term's goals",
"the goal of any team player is the win",
"a win for the team is a win for each player",
"players vote other players for another term"
])
Like before, we use the CountVectorizer class to convert our training dataset into a valid input for the MultinomialNB classifier implementation — but now going beyond unigrams. The ngram_range parameter of the CountVectorizer class specifies the range of n-grams to be extracted from the input text. It is a tuple (min_n, max_n) where min_n represents the minimum size of the n-grams and max_n represents the maximum size. For example, the default setting ngram_range=(1, 1) will extract only unigrams (single words), while ngram_range=(1, 2) will extract both unigrams and bigrams (two-word sequences). Similarly, ngram_range=(2, 3) will extract bigrams and trigrams. In the code cell below, we use ngram_range=(1, 3) to extract all unigrams, bigrams, and trigrams.
count_vectorizer_unibitrigrams = CountVectorizer(ngram_range=(1,3))
X_train_vectorized_unibitrigrams = count_vectorizer_unibitrigrams.fit_transform(sentences)
# Convert to pandas dataframe -- just for a nice visualization
df_tdm_unibitrigrams = pd.DataFrame(X_train_vectorized_unibitrigrams.A.T, columns=[ "d{}".format(d+1) for d in range(len(X_train)) ])
df_tdm_unibitrigrams = df_tdm_unibitrigrams.set_index(pd.Index(count_vectorizer_unibitrigrams.get_feature_names_out()))
df_tdm_unibitrigrams
| d1 | d2 | d3 | d4 | d5 | d6 | d7 | |
|---|---|---|---|---|---|---|---|
| and | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| and next | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| and next term | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| another | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| another term | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| win for | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
| win for each | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| win for the | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| win the | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| win the vote | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
113 rows × 7 columns
Recall that we had only 8 features when considering unigrams alone, as this was the size of our vocabulary — to be fair, this was without stopwords. Now, with all unigrams, bigrams, and trigrams, we have 113 features in total. This has both advantages and disadvantages, and the choice of n-gram size typically depends on the specific problem, dataset, and the balance between model complexity and performance. The main advantages of using larger n-grams are:
Improved contextual representation: Larger n-grams (bigrams, trigrams) capture the sequential context and relationships between words that unigrams might miss. For example, in sentiment analysis, a bigram like "not good" or a trigram like "not at all helpful" provides clear sentiment cues that the individual words ("not", "good", "helpful") do not convey in isolation. This improved representation often leads to more accurate predictions in tasks where word order matters.
Disambiguation of meaning: Larger n-grams can help disambiguate words that might have multiple meanings in different contexts. For example, "New York" as a bigram is distinct from the individual words "new" and "York". This is particularly useful in tasks where preserving the meaning of multi-word expressions is critical.
Higher predictive power: Since larger n-grams encode specific phrases and patterns, they often have higher discriminative power for classifying documents into distinct categories. For instance, phrases like "breaking news" or "artificial intelligence" are strong indicators of certain topics.
On the flip side, considering larger n-grams often poses challenges in practice:
Increased dimensionality: Including larger n-grams significantly expands the feature space, as the number of possible n-grams grows exponentially with the size of the vocabulary and the n-gram length. This increased dimensionality can lead to higher computational costs, longer training times, and potentially memory issues, especially for large datasets.
Sparsity of data: Larger n-grams tend to appear less frequently in documents, resulting in sparse feature representations. Sparsity can reduce the effectiveness of the MNB classifier, as it relies on sufficient occurrences of features to estimate probabilities reliably. Sparse features may also contribute little to classification and may need pruning.
Risk of overfitting: When working with small datasets, larger n-grams can lead to overfitting, where the model memorizes specific patterns or phrases that are unique to the training set but do not generalize well to unseen data. This can harm the classifier's performance on new examples.
In short, the choice of n-gram size requires balancing these factors. In practice, using a combination of n-gram sizes (e.g., unigrams and bigrams, or unigrams through trigrams) often yields good results by capturing both individual terms and meaningful word sequences. Additionally, techniques like feature selection (e.g., keeping only the top-k most frequent n-grams), regularization, and dimensionality reduction can mitigate the downsides of larger n-grams while preserving their benefits.
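As a rough illustration of these trade-offs, the short sketch below (not from the original notebook, reusing the sentences array defined above) prints the vocabulary size for a few ngram_range settings and shows how the pruning options mentioned above, here the min_df and max_features parameters of CountVectorizer with arbitrarily chosen values, shrink the feature space.
# Illustrative sketch: feature-space growth with larger n-grams and simple pruning
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary size for increasing n-gram ranges (uses the sentences array from above)
for ngram_range in [(1, 1), (1, 2), (1, 3)]:
    vocab_size = len(CountVectorizer(ngram_range=ngram_range).fit(sentences).vocabulary_)
    print("ngram_range={}: {} features".format(ngram_range, vocab_size))

# Keep only n-grams occurring in at least 2 sentences, capped at the 20 most frequent
pruned_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=2, max_features=20)
print("pruned: {} features".format(len(pruned_vectorizer.fit(sentences).vocabulary_)))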
Beyond Simple Counts¶
The MNB classification algorithm assumes that the features (e.g., word counts) are generated from a multinomial distribution and interprets them as discrete probabilities. With the Term-Document Matrix as the input feature matrix, this means that all values are integers representing word counts; hence the use of the CountVectorizer class in the previous examples. However, the learning algorithm itself does not require the values in the feature matrix to be integers and, in fact, accepts any kind of non-negative numerical values. This includes, for example, the Term-Document Matrix with TF-IDF (Term Frequency-Inverse Document Frequency) weights. The intuition behind using TF-IDF weights instead of simple counts lies in emphasizing the importance of words that are more informative and less frequent across the corpus. While word counts simply reflect how often a word appears in a document, they do not account for the fact that some words, such as stopwords ("the", "is", "and"), are very common and may not contribute much to distinguishing between classes, which is why stopwords are typically removed when working with unigrams only.
TF-IDF weights aim to strike a balance between two factors. First, the Term Frequency (TF) captures how frequently a word appears in a document; higher frequency implies greater relevance within that document. This corresponds to the basic word counts, although some implementations of TF-IDF apply a monotonic function such as the logarithm to scale this value. Second, the Inverse Document Frequency (IDF) measures how unique a word is across the corpus. Words that appear in many documents (e.g., "show", "step", "good") get lower weights, while rarer, more class-specific words (e.g., "quantum", "electron", or "gravity" in a physics-related dataset) are given higher weights. Very common stopwords such as "the", "is", and "and" that appear in virtually all documents will, under the standard definition, receive a TF-IDF weight of $0$ (scikit-learn's smoothed variant keeps a small positive weight instead; see the formula sketch after the list below). By combining these two measures, TF-IDF reduces the impact of common words that provide little discriminatory power and highlights words that are more likely to indicate the document's class. Using these weights when training an MNB classification model can offer several advantages:
Improved feature discrimination: With TF-IDF, the model gives more weight to words that are distinctive and less weight to ubiquitous terms that occur frequently but are less meaningful for classification. For example, in a spam detection task, the word "offer" might be more indicative of spam compared to "provide" which occurs in both spam and non-spam emails.
Reduced noise: Simple counts can inflate the importance of frequent but uninformative words, leading to noisy features that degrade the classifier's performance. TF-IDF mitigates this by penalizing such terms through the IDF component, making the feature set more meaningful for classification. This includes bigrams, trigrams, or generally n-grams beyond unigrams.
Balanced feature contribution: TF-IDF helps avoid domination by long documents that naturally contain more word occurrences. By normalizing term weights, TF-IDF ensures that each document's feature vector is better scaled, improving the fairness of the classification.
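For reference, and since the two factors above are easy to state formally, the textbook form of the TF-IDF weight of a term $t$ in document $d$ over a corpus of $N$ documents is shown below, together with the smoothed IDF used by scikit-learn's TfidfVectorizer (which additionally L2-normalizes each document vector by default):
$$\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \log\frac{N}{\text{df}(t)}, \qquad \text{idf}_{\text{smooth}}(t) = \log\frac{1 + N}{1 + \text{df}(t)} + 1$$
Here, $\text{df}(t)$ is the number of documents containing $t$. Under the textbook definition, a term with $\text{df}(t) = N$ receives an IDF, and hence a TF-IDF weight, of $0$; under the smoothed variant it keeps a small positive weight.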
Despite these benefits, TF-IDF is not perfectly aligned with the assumptions of the MNB model: TF-IDF produces continuous, real-valued weights, which do not strictly satisfy the assumption of features generated from a multinomial distribution. While TF-IDF can still work well in practice with MNB, its use introduces a minor mismatch between the data representation and the model's theoretical foundation. In practice, TF-IDF is particularly useful when dealing with text datasets that contain many common or irrelevant words and where the discriminatory power of individual terms needs to be highlighted. For datasets with highly distinctive terms or very short texts, simple counts may suffice. Overall, TF-IDF can be a valid choice for improving the quality of feature representations and boosting the performance of the MNB algorithm for document classification tasks.
With existing libraries, using TF-IDF weights instead of simple word counts is very straightforward. For example, apart from the CountVectorizer class, the scikit-learn library also provides a TfidfVectorizer class to calculate the TF-IDF weights for a given training dataset of text documents. Again, this class has an input parameter ngram_range to support n-grams of various sizes (see above). As a result, using TF-IDF weights instead of simple word counts requires changing only a single line, namely the first line of code in the code cell below, compared to our earlier example using the CountVectorizer.
tfidf_vectorizer_unibitrigrams = TfidfVectorizer(ngram_range=(1,3))
X_train_vectorized_unibitrigrams = tfidf_vectorizer_unibitrigrams.fit_transform(sentences)
# Convert to pandas dataframe -- just for a nice visualization
df_tdm_unibitrigrams = pd.DataFrame(X_train_vectorized_unibitrigrams.toarray().T, columns=[ "d{}".format(d+1) for d in range(len(sentences)) ])
df_tdm_unibitrigrams = df_tdm_unibitrigrams.set_index(pd.Index(tfidf_vectorizer_unibitrigrams.get_feature_names_out()))
df_tdm_unibitrigrams
| | d1 | d2 | d3 | d4 | d5 | d6 | d7 |
|---|---|---|---|---|---|---|---|
| and | 0.21558 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| and next | 0.21558 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| and next term | 0.21558 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| another | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.233945 |
| another term | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.233945 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| win for | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.421222 | 0.000000 |
| win for each | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.210611 | 0.000000 |
| win for the | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.210611 | 0.000000 |
| win the | 0.00000 | 0.0 | 0.222777 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| win the vote | 0.00000 | 0.0 | 0.222777 | 0.0 | 0.0 | 0.000000 | 0.000000 |
113 rows × 7 columns
With the same input parameter ngram_range=(1,3) as before, we again get 113 features representing all unigrams, bigrams, and trigrams. However, the feature values are now no longer integers representing the n-gram counts but real values representing the TF-IDF weights. Still, this feature matrix is a valid input for the MultinomialNB class of scikit-learn implementing the MNB classification algorithm.
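To close the loop, here is a minimal sketch (not part of the original notebook) that fits MultinomialNB once on the count-based and once on the TF-IDF feature matrices built above. Since this section's toy dataset has no class labels, the labels below are purely hypothetical and chosen only for illustration.
# Illustrative sketch: MultinomialNB accepts both count and TF-IDF features
from sklearn.naive_bayes import MultinomialNB

# Hypothetical class labels, assigned only for illustration
y_train = ["politics", "politics", "politics", "politics", "sports", "sports", "sports"]

# Reuse the fitted vectorizers from the cells above
X_counts = count_vectorizer_unibitrigrams.transform(sentences)
X_tfidf = tfidf_vectorizer_unibitrigrams.transform(sentences)

mnb_counts = MultinomialNB().fit(X_counts, y_train)
mnb_tfidf = MultinomialNB().fit(X_tfidf, y_train)

test_sentence = ["the team hopes to win the vote next term"]
print(mnb_counts.predict(count_vectorizer_unibitrigrams.transform(test_sentence)))
print(mnb_tfidf.predict(tfidf_vectorizer_unibitrigrams.transform(test_sentence)))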
Summary¶
The Multinomial Naive Bayes (MNB) algorithm is a probabilistic machine learning model widely used for document classification tasks such as spam filtering, sentiment analysis, and topic classification. It belongs to the family of Naive Bayes algorithms and is particularly well-suited for text data because it assumes that features follow a multinomial distribution. In this context, the features are typically word frequencies or occurrences derived from the documents, making MNB a natural fit for bag-of-words or term-frequency representations. The algorithm operates on Bayes' theorem, which calculates the posterior probability of a class given a document based on the prior probability of the class and the likelihood of the document under that class. The "naive" assumption in Naive Bayes is that all features (e.g., words) are conditionally independent of each other given the class label. While this assumption is often violated in real-world text data, the model still performs surprisingly well in many applications due to its simplicity and efficiency.
MNB computes the likelihood of a document belonging to a class by analyzing the frequency of words in the document and comparing it to the word distributions in the training data for each class. Specifically, it estimates the probability of each word occurring in each class using Maximum Likelihood Estimation (MLE) with additive smoothing (commonly Laplace smoothing) to handle words that may not appear in the training data. This ensures the algorithm can handle unseen words gracefully and avoids assigning zero probabilities.
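As a point of reference for the smoothed estimate described above, the standard additive-smoothing formula for the likelihood of word $w_i$ given class $c$ is:
$$\hat{P}(w_i \mid c) = \frac{N_{c,i} + \alpha}{N_c + \alpha \cdot |V|}$$
where $N_{c,i}$ is the count of $w_i$ across all training documents of class $c$, $N_c = \sum_i N_{c,i}$ is the total word count in class $c$, $|V|$ is the vocabulary size, and $\alpha = 1$ corresponds to Laplace smoothing (exposed as the alpha parameter of scikit-learn's MultinomialNB class).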
One of the key advantages of Multinomial Naive Bayes is its computational efficiency, as it requires minimal resources to train and predict. It works well with high-dimensional data, such as text corpora, and scales effectively to large datasets. However, its conditional independence assumption means that each feature contributes to the prediction on its own, which can limit performance when more complex patterns exist in the data. Additionally, MNB is sensitive to how the text is preprocessed (e.g., tokenization, stopword removal, or n-gram representation), and its results can depend on whether simple counts or weighted representations such as term frequency-inverse document frequency (TF-IDF) are used. In summary, Multinomial Naive Bayes is a robust, simple-to-implement algorithm for text classification that balances performance and computational efficiency. Its effectiveness in real-world applications, despite its simplicity, has made it a cornerstone of many NLP pipelines, especially in situations where interpretability and speed are critical.