-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcngram.py
106 lines (91 loc) · 3.66 KB
/
cngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# cngram.py
# Using Python 3.4.3
import math
from collections import defaultdict, Counter
class CNGram:
"""Represents a section of text with n characters.
Args:
lang (str): The language of the n-gram model
words (list<str>): Tokenized words for a single language.
num_letters (int): Number of letters in the original text. The default
is 26.
n (int, optional): The length of the n-gram. The default is 5.
Properties:
lang (str): n-gram language
words (list<str>): Tokenized words for a single language.
num_letters (int): Number of letters in the original text. The default
is 26.
n (int, optional): The length of the n-gram. The default is 5.
"""
def __init__(self, lang, words, num_letters=26, n=5):
self.lang = lang
self.cond_cnts = get_cond_cnts(words, n)
self.num_letters = num_letters
self.n = n
self._normalize_counts()
def _normalize_counts(self):
"""Normalizes the counts within the n-gram's cond_cnts"""
for ctx, cnts in self.cond_cnts.items():
for lastc, cnt in cnts.items():
ctx_size = len(self.cond_cnts[ctx])
self.cond_cnts[ctx][lastc] = (cnt + 1)/float(ctx_size + self.num_letters)
def ngram_prob(self, ctx, c):
"""Using conditional frequency distribution, calculate p(c | ctx).
Return:
float: P(c | ctx)
"""
# Return the counts divided by the number of letters
ctx_size = len(self.cond_cnts[ctx])
return self.cond_cnts.get(ctx, {}).get(c, 1.0/float(ctx_size + self.num_letters))
def word_prob(self, word):
""" Multiply n-gram probabilites for each n-gram in word
Return:
float: Probability of a certain word
"""
prob = 1.0
for ctx, cnts in get_cond_cnts([word], self.n).items():
for lastchar, cnt in cnts.items():
prob *= self.ngram_prob(ctx, lastchar) * cnt
return math.log(prob)
def get_ngrams(word, n):
"""Splits word into character n-grams of length n.
The provided string is padded with spaces at the beginning and end.
Args:
word(str): the training data
n (int): the n-gram length
Return:
list<str>: n-gram strings
"""
pad = " " * (n - 1)
word = pad + word + pad
return (word[i:i+n] for i in range(len(word) - n + 1))
def get_cond_cnts(words, n):
""" Creates the conditional frequency distribution.
The process is as follows:
for each word,
get n-grams using above method
group by first n-1 characters (context)
for each unique context, create a map of last character counts
e.g., if you have the n-grams "chair" and "chain":
the context would be "chai" and the last character counts
would be "r -> 1" and "n -> 1"
add the [ context -> last character counts ] mapping to your
ultimate CFD (also a map)
e.g., the CFD would have lots of entries that look like:
"chai" -> ["r" -> 1, "n" -> 1]
as an key-value pair.
Args:
words (list<str>): words from the training data
n (int): length of an n-gram
Return:
dict<str, dict<str, int>: a mapping of context (unique substring of
the first n-1 characters) to endings (last character) and their
frequencies
"""
cond_cnts = defaultdict(Counter)
for words in words:
ngrams = get_ngrams(words, n)
for ngram in ngrams:
ctx, lastc = ngram[:n - 1], ngram[-1]
cond_cnts[ctx][lastc] += 1
return cond_cnts