Probabilities Application: Letter Frequencies#
FIZ371 - Scientific & Technical Calculations | 11/10/2023
Emre S. Tasci emre.tasci@hacettepe.edu.tr
Using L. Frank Baum’s “The Wonderful Wizard of Oz” book, calculate the frequencies of the letters & bigrams.
(The book, written in 1900, is now in public domain and available from Project Gutenberg)
import numpy as np
We first read the text into the data
variable and define the set of letters we are interested in:
fname = "supp/wizardofoz_1990_publicdomain_guthenberg.txt"

# Characters we track: the 26 lowercase letters plus the space.
alphabet_str = "abcdefghijklmnopqrstuvwxyz "
alphabet = list(alphabet_str)

# Read the whole book, joining lines with spaces so bigrams can span
# line breaks, then normalize everything to lowercase.
with open(fname, 'r') as book_file:
    data = book_file.read().replace('\n', ' ')
data = data.lower()
Letters#
Count and store the frequencies into the count_letter
dictionary:
# Frequency of each tracked character in the text.
count_letter = {letter: data.count(letter) for letter in alphabet}
count_letter
{'a': 13887,
'b': 2448,
'c': 4161,
'd': 8888,
'e': 22261,
'f': 3588,
'g': 3587,
'h': 11977,
'i': 10350,
'j': 240,
'k': 1920,
'l': 7256,
'm': 3830,
'n': 10752,
'o': 14240,
'p': 2466,
'q': 144,
'r': 10534,
's': 9689,
't': 16378,
'u': 4600,
'v': 1290,
'w': 5229,
'x': 209,
'y': 4128,
'z': 277,
' ': 44331}
To calculate the probabilities, we divide each frequency by the total sum:
# Total number of counted characters (letters + spaces).
letter_counts = list(count_letter.values())
tot_count = np.sum(letter_counts)
print(tot_count)
218660
# Normalize the raw counts into probabilities.
probs_letter = {letter: count / tot_count
                for letter, count in count_letter.items()}
probs_letter
{'a': 0.06350955821823837,
'b': 0.0111954632763194,
'c': 0.019029543583645843,
'd': 0.04064758071892436,
'e': 0.1018064575139486,
'f': 0.016409036860879904,
'g': 0.01640446355071801,
'h': 0.05477453580901857,
'i': 0.04733376017561511,
'j': 0.0010975944388548432,
'k': 0.008780755510838746,
'l': 0.033183938534711424,
'm': 0.01751577792005854,
'n': 0.04917223086069697,
'o': 0.06512393670538735,
'p': 0.011277782859233513,
'q': 0.0006585566633129059,
'r': 0.048175249245403826,
's': 0.0443108021586024,
't': 0.07490167383151926,
'u': 0.021037226744717828,
'v': 0.005899570108844782,
'w': 0.023913838836549895,
'x': 0.0009558218238360926,
'y': 0.018878624348303303,
'z': 0.0012668069148449649,
' ': 0.20273941278697521}
Bigrams#
We are going to generate all possible bigrams and count them, storing in count_bigram
:
# Count every possible two-character combination (bigram).
# NOTE: str.count() counts non-overlapping matches, so a run such as
# "aaa" contributes only one "aa" — a small undercount for doubled letters.
count_bigram = {}
for first in alphabet:
    for second in alphabet:
        count_bigram[first + second] = data.count(first + second)
total_sum_bigram = sum(count_bigram.values())
count_bigram
{'aa': 0,
'ab': 124,
'ac': 350,
'ad': 789,
'ae': 1,
'af': 143,
'ag': 264,
'ah': 12,
'ai': 790,
'aj': 2,
'ak': 156,
'al': 999,
'am': 310,
'an': 3170,
'ao': 0,
'ap': 218,
'aq': 0,
'ar': 1552,
'as': 1506,
'at': 1442,
'au': 147,
'av': 343,
'aw': 223,
'ax': 36,
'ay': 377,
'az': 12,
'a ': 897,
'ba': 254,
'bb': 9,
'bc': 0,
'bd': 0,
'be': 857,
'bf': 0,
'bg': 0,
'bh': 0,
'bi': 120,
'bj': 2,
'bk': 0,
'bl': 191,
'bm': 0,
'bn': 0,
'bo': 207,
'bp': 0,
'bq': 0,
'br': 243,
'bs': 7,
'bt': 6,
'bu': 393,
'bv': 0,
'bw': 0,
'bx': 0,
'by': 147,
'bz': 0,
'b ': 7,
'ca': 798,
'cb': 0,
'cc': 26,
'cd': 0,
'ce': 470,
'cf': 0,
'cg': 0,
'ch': 699,
'ci': 150,
'cj': 0,
'ck': 377,
'cl': 189,
'cm': 0,
'cn': 0,
'co': 692,
'cp': 0,
'cq': 0,
'cr': 394,
'cs': 0,
'ct': 232,
'cu': 67,
'cv': 0,
'cw': 0,
'cx': 0,
'cy': 21,
'cz': 0,
'c ': 41,
'da': 199,
'db': 0,
'dc': 6,
'dd': 73,
'de': 715,
'df': 12,
'dg': 19,
'dh': 1,
'di': 358,
'dj': 0,
'dk': 1,
'dl': 130,
'dm': 189,
'dn': 16,
'do': 781,
'dp': 0,
'dq': 0,
'dr': 94,
'ds': 170,
'dt': 0,
'du': 29,
'dv': 7,
'dw': 5,
'dx': 0,
'dy': 57,
'dz': 0,
'd ': 5375,
'ea': 1165,
'eb': 28,
'ec': 576,
'ed': 2124,
'ee': 806,
'ef': 184,
'eg': 88,
'eh': 27,
'ei': 175,
'ej': 2,
'ek': 17,
'el': 660,
'em': 512,
'en': 1363,
'eo': 68,
'ep': 245,
'eq': 14,
'er': 2876,
'es': 1090,
'et': 472,
'eu': 5,
'ev': 279,
'ew': 147,
'ex': 105,
'ey': 568,
'ez': 1,
'e ': 7884,
'fa': 192,
'fb': 0,
'fc': 0,
'fd': 0,
'fe': 246,
'ff': 97,
'fg': 0,
'fh': 0,
'fi': 234,
'fj': 0,
'fk': 0,
'fl': 113,
'fm': 0,
'fn': 0,
'fo': 749,
'fp': 0,
'fq': 0,
'fr': 298,
'fs': 1,
'ft': 157,
'fu': 214,
'fv': 0,
'fw': 1,
'fx': 0,
'fy': 2,
'fz': 0,
'f ': 1214,
'ga': 222,
'gb': 0,
'gc': 0,
'gd': 2,
'ge': 410,
'gf': 0,
'gg': 35,
'gh': 444,
'gi': 198,
'gj': 0,
'gk': 0,
'gl': 84,
'gm': 0,
'gn': 4,
'go': 288,
'gp': 0,
'gq': 0,
'gr': 485,
'gs': 75,
'gt': 3,
'gu': 135,
'gv': 0,
'gw': 0,
'gx': 0,
'gy': 5,
'gz': 0,
'g ': 964,
'ha': 1515,
'hb': 0,
'hc': 0,
'hd': 0,
'he': 6611,
'hf': 2,
'hg': 0,
'hh': 0,
'hi': 1157,
'hj': 0,
'hk': 34,
'hl': 5,
'hm': 0,
'hn': 0,
'ho': 688,
'hp': 0,
'hq': 0,
'hr': 142,
'hs': 21,
'ht': 299,
'hu': 69,
'hv': 0,
'hw': 2,
'hx': 0,
'hy': 416,
'hz': 0,
'h ': 846,
'ia': 59,
'ib': 89,
'ic': 423,
'id': 677,
'ie': 424,
'if': 266,
'ig': 309,
'ih': 0,
'ii': 22,
'ij': 0,
'ik': 71,
'il': 624,
'im': 386,
'in': 2595,
'io': 393,
'ip': 42,
'iq': 0,
'ir': 440,
'is': 1048,
'it': 1504,
'iu': 5,
'iv': 207,
'iw': 0,
'ix': 7,
'iy': 0,
'iz': 68,
'i ': 611,
'ja': 8,
'jb': 0,
'jc': 0,
'jd': 0,
'je': 97,
'jf': 0,
'jg': 0,
'jh': 0,
'ji': 0,
'jj': 0,
'jk': 0,
'jl': 0,
'jm': 0,
'jn': 0,
'jo': 75,
'jp': 0,
'jq': 0,
'jr': 0,
'js': 0,
'jt': 0,
'ju': 60,
'jv': 0,
'jw': 0,
'jx': 0,
'jy': 0,
'jz': 0,
'j ': 0,
'ka': 62,
'kb': 0,
'kc': 0,
'kd': 0,
'ke': 741,
'kf': 10,
'kg': 0,
'kh': 0,
'ki': 269,
'kj': 0,
'kk': 0,
'kl': 27,
'km': 7,
'kn': 110,
'ko': 1,
'kp': 0,
'kq': 0,
'kr': 0,
'ks': 82,
'kt': 0,
'ku': 0,
'kv': 0,
'kw': 3,
'kx': 0,
'ky': 20,
'kz': 0,
'k ': 455,
'la': 425,
'lb': 2,
'lc': 6,
'ld': 637,
'le': 1083,
'lf': 90,
'lg': 2,
'lh': 0,
'li': 828,
'lj': 0,
'lk': 117,
'll': 1165,
'lm': 12,
'ln': 1,
'lo': 637,
'lp': 52,
'lq': 0,
'lr': 4,
'ls': 77,
'lt': 71,
'lu': 72,
'lv': 58,
'lw': 21,
'lx': 0,
'ly': 628,
'lz': 0,
'l ': 1049,
'ma': 770,
'mb': 60,
'mc': 0,
'md': 0,
'me': 948,
'mf': 11,
'mg': 0,
'mh': 3,
'mi': 241,
'mj': 0,
'mk': 0,
'ml': 9,
'mm': 34,
'mn': 6,
'mo': 279,
'mp': 88,
'mq': 0,
'mr': 13,
'ms': 119,
'mt': 0,
'mu': 174,
'mv': 0,
'mw': 0,
'mx': 0,
'my': 221,
'mz': 0,
'm ': 654,
'na': 99,
'nb': 100,
'nc': 242,
'nd': 2476,
'ne': 816,
'nf': 23,
'ng': 1192,
'nh': 13,
'ni': 221,
'nj': 5,
'nk': 204,
'nl': 102,
'nm': 1,
'nn': 85,
'no': 752,
'np': 2,
'nq': 14,
'nr': 14,
'ns': 393,
'nt': 691,
'nu': 40,
'nv': 7,
'nw': 1,
'nx': 15,
'ny': 194,
'nz': 0,
'n ': 2485,
'oa': 108,
'ob': 35,
'oc': 67,
'od': 407,
'oe': 62,
'of': 1044,
'og': 50,
'oh': 40,
'oi': 134,
'oj': 88,
'ok': 205,
'ol': 320,
'om': 580,
'on': 1549,
'oo': 808,
'op': 245,
'oq': 7,
'or': 1641,
'os': 200,
'ot': 1062,
'ou': 1952,
'ov': 196,
'ow': 979,
'ox': 6,
'oy': 43,
'oz': 171,
'o ': 2101,
'pa': 220,
'pb': 8,
'pc': 0,
'pd': 2,
'pe': 375,
'pf': 0,
'pg': 2,
'ph': 28,
'pi': 115,
'pj': 0,
'pk': 0,
'pl': 287,
'pm': 3,
'pn': 0,
'po': 281,
'pp': 166,
'pq': 0,
'pr': 337,
'ps': 35,
'pt': 117,
'pu': 77,
'pv': 0,
'pw': 1,
'px': 0,
'py': 67,
'pz': 0,
'p ': 265,
'qa': 0,
'qb': 0,
'qc': 0,
'qd': 0,
'qe': 0,
'qf': 0,
'qg': 0,
'qh': 0,
'qi': 0,
'qj': 0,
'qk': 0,
'ql': 0,
'qm': 0,
'qn': 0,
'qo': 0,
'qp': 0,
'qq': 0,
'qr': 0,
'qs': 0,
'qt': 0,
'qu': 139,
'qv': 0,
'qw': 0,
'qx': 0,
'qy': 0,
'qz': 0,
'q ': 3,
'ra': 654,
'rb': 9,
'rc': 71,
'rd': 272,
're': 2572,
'rf': 57,
'rg': 137,
'rh': 10,
'ri': 608,
'rj': 0,
'rk': 199,
'rl': 165,
'rm': 181,
'rn': 200,
'ro': 1537,
'rp': 63,
'rq': 0,
'rr': 182,
'rs': 333,
'rt': 334,
'ru': 153,
'rv': 16,
'rw': 16,
'rx': 0,
'ry': 387,
'rz': 0,
'r ': 2002,
'sa': 599,
'sb': 1,
'sc': 302,
'sd': 0,
'se': 943,
'sf': 4,
'sg': 1,
'sh': 870,
'si': 281,
'sj': 0,
'sk': 197,
'sl': 96,
'sm': 58,
'sn': 25,
'so': 607,
'sp': 128,
'sq': 6,
'sr': 0,
'ss': 284,
'st': 1202,
'su': 190,
'sv': 0,
'sw': 122,
'sx': 0,
'sy': 16,
'sz': 0,
's ': 3081,
'ta': 308,
'tb': 0,
'tc': 182,
'td': 1,
'te': 1125,
'tf': 13,
'tg': 0,
'th': 6453,
'ti': 721,
'tj': 0,
'tk': 0,
'tl': 272,
'tm': 3,
'tn': 4,
'to': 1762,
'tp': 0,
'tq': 0,
'tr': 497,
'ts': 178,
'tt': 304,
'tu': 159,
'tv': 0,
'tw': 44,
'tx': 0,
'ty': 187,
'tz': 0,
't ': 3602,
'ua': 50,
'ub': 26,
'uc': 128,
'ud': 62,
'ue': 117,
'uf': 34,
'ug': 250,
'uh': 0,
'ui': 108,
'uj': 0,
'uk': 0,
'ul': 547,
'um': 62,
'un': 615,
'uo': 0,
'up': 248,
'uq': 0,
'ur': 503,
'us': 462,
'ut': 802,
'uu': 0,
'uv': 0,
'uw': 0,
'ux': 0,
'uy': 0,
'uz': 7,
'u ': 490,
'va': 13,
'vb': 0,
'vc': 0,
'vd': 0,
've': 1105,
'vf': 0,
'vg': 0,
'vh': 0,
'vi': 108,
'vj': 0,
'vk': 0,
'vl': 0,
'vm': 0,
'vn': 0,
'vo': 45,
'vp': 0,
'vq': 0,
'vr': 0,
'vs': 0,
'vt': 0,
'vu': 0,
'vv': 0,
'vw': 0,
'vx': 0,
'vy': 9,
'vz': 0,
'v ': 5,
'wa': 980,
'wb': 0,
'wc': 0,
'wd': 5,
'we': 794,
'wf': 9,
'wg': 0,
'wh': 711,
'wi': 1031,
'wj': 0,
'wk': 2,
'wl': 24,
'wm': 0,
'wn': 163,
'wo': 616,
'wp': 0,
'wq': 0,
'wr': 19,
'ws': 50,
'wt': 0,
'wu': 2,
'wv': 0,
'ww': 9,
'wx': 0,
'wy': 0,
'wz': 0,
'w ': 588,
'xa': 8,
'xb': 0,
'xc': 29,
'xd': 0,
'xe': 36,
'xf': 0,
'xg': 0,
'xh': 1,
'xi': 36,
'xj': 0,
'xk': 0,
'xl': 0,
'xm': 0,
'xn': 0,
'xo': 0,
'xp': 24,
'xq': 0,
'xr': 0,
'xs': 0,
'xt': 37,
'xu': 0,
'xv': 8,
'xw': 0,
'xx': 10,
'xy': 0,
'xz': 0,
'x ': 12,
'ya': 16,
'yb': 3,
'yc': 16,
'yd': 0,
'ye': 182,
'yf': 4,
'yg': 0,
'yh': 0,
'yi': 61,
'yj': 0,
'yk': 0,
'yl': 3,
'ym': 4,
'yn': 1,
'yo': 703,
'yp': 3,
'yq': 0,
'yr': 20,
'ys': 101,
'yt': 41,
'yu': 0,
'yv': 0,
'yw': 12,
'yx': 0,
'yy': 0,
'yz': 0,
'y ': 2351,
'za': 47,
'zb': 0,
'zc': 0,
'zd': 0,
'ze': 29,
'zf': 0,
'zg': 0,
'zh': 0,
'zi': 6,
'zj': 0,
'zk': 0,
'zl': 6,
'zm': 0,
'zn': 0,
'zo': 0,
'zp': 0,
'zq': 0,
'zr': 0,
'zs': 0,
'zt': 0,
'zu': 2,
'zv': 0,
'zw': 0,
'zx': 0,
'zy': 7,
'zz': 8,
'z ': 93,
' a': 5006,
' b': 1847,
' c': 1691,
' d': 1331,
' e': 667,
' f': 1531,
' g': 1208,
' h': 2599,
' i': 2128,
' j': 139,
' k': 335,
' l': 1203,
' m': 1441,
' n': 808,
' o': 2198,
' p': 918,
' q': 100,
' r': 765,
' s': 3324,
' t': 7379,
' u': 514,
' v': 154,
' w': 3466,
' x': 30,
' y': 705,
' z': 4,
' ': 1401}
Exercise#
Calculate the probabilities:
# p(x="b"): marginal probability of the letter "b"
p_b = probs_letter["b"]
p_b
0.0111954632763194
# p(x="a", y="n"): joint probability of the bigram "an"
p_an = count_bigram["an"] / total_sum_bigram
p_an
0.015006982744336875
Exercise#
Calculate the probabilities:
\(p(x="a"|y="n")\)#
To calculate the first probability, we need to define a subset that contains all the bigrams whose second letter is equal to “n”:
# All bigrams whose SECOND letter is "n" (i.e., "?n").
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
subset_x_yn
{'an': 3170,
'bn': 0,
'cn': 0,
'dn': 16,
'en': 1363,
'fn': 0,
'gn': 4,
'hn': 0,
'in': 2595,
'jn': 0,
'kn': 110,
'ln': 1,
'mn': 6,
'nn': 85,
'on': 1549,
'pn': 0,
'qn': 0,
'rn': 200,
'sn': 25,
'tn': 4,
'un': 615,
'vn': 0,
'wn': 163,
'xn': 0,
'yn': 1,
'zn': 0,
' n': 808}
The total of the subset, sum_subset_x_yn
is:
subset_x_yn.values()
dict_values([3170, 0, 0, 16, 1363, 0, 4, 0, 2595, 0, 110, 1, 6, 85, 1549, 0, 0, 200, 25, 4, 615, 0, 163, 0, 1, 0, 808])
# Pack the subset counts into a numpy array so we can use its
# sum() method directly.
array_1 = np.array([*subset_x_yn.values()])
array_1
array([3170, 0, 0, 16, 1363, 0, 4, 0, 2595, 0, 110,
1, 6, 85, 1549, 0, 0, 200, 25, 4, 615, 0,
163, 0, 1, 0, 808])
# Total count of all bigrams ending in "n"
total_subset_x_yn = array_1.sum()
total_subset_x_yn
10715
Thus, \(p(x="a"|y="n")\) probability is:
# p(x="a"|y="n") = count("an") / total count of "?n" bigrams
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn
prob_xa_yn
0.2958469435370975
\(p(y="n"|x="a")\)#
For the second probability, we construct a new subset that contains the bigrams whose first letter is “a”:
# All bigrams whose FIRST letter is "a" (i.e., "a?").
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
subset_y_xa
{'aa': 0,
'ab': 124,
'ac': 350,
'ad': 789,
'ae': 1,
'af': 143,
'ag': 264,
'ah': 12,
'ai': 790,
'aj': 2,
'ak': 156,
'al': 999,
'am': 310,
'an': 3170,
'ao': 0,
'ap': 218,
'aq': 0,
'ar': 1552,
'as': 1506,
'at': 1442,
'au': 147,
'av': 343,
'aw': 223,
'ax': 36,
'ay': 377,
'az': 12,
'a ': 897}
The rest is similar to the first one:
# Pack the "a?" subset counts into a numpy array for summation.
array_2 = np.array([*subset_y_xa.values()])
array_2
array([ 0, 124, 350, 789, 1, 143, 264, 12, 790, 2, 156,
999, 310, 3170, 0, 218, 0, 1552, 1506, 1442, 147, 343,
223, 36, 377, 12, 897])
# Total count of all bigrams starting with "a"
total_subset_y_xa = array_2.sum()
total_subset_y_xa
13863
# p(y="n"|x="a") = count("an") / total count of "a?" bigrams
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa
prob_yn_xa
0.22866623385991489
Exercise#
Calculate the probability:
For this one, we have all the factors:
prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29533928474819926
Let’s check if this is equal to \(p(x="a"|y="n")\) as Bayes Theorem dictates:
prob_xa_yn
0.2958469435370975
The two values are close but not equal. I hope that you are able to figure out the reason for this difference. If not, please ponder on it a while before proceeding! 8)
(Spoilers ahead!)
The reason is due to the characters unaccounted for (e.g., “d.”). Consider the following paragraph:
“My darling child!” she cried, folding the little girl in her arms and covering her face with kisses. “Where in the world did you come from?”
If we were to calculate the marginal probability of \(p("n")\), for example, we would first count all the occurrences of the letter "n" and then divide by all the characters included in the text:
# Sample paragraph, kept verbatim (punctuation included on purpose)
text = '''"My darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "Where in the world did you come from?"'''
# Normalize to lowercase, as we did for the full book
text = text.lower()
print(text)
"my darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "where in the world did you come from?"
# Number of "n" occurrences in the sample paragraph
count_n = text.count("n")
count_n
6
# Total number of characters, punctuation included
count_all = len(text)
count_all
141
# Marginal probability p("n") over ALL characters (punctuation too)
p_n = count_n / count_all
p_n
0.0425531914893617
However, this marginal probability includes the uncounted characters such as {"'", '"', ",", ".", "!"} which aren't accounted for when we were calculating the bigram probabilities, thus messing with our calculations. This also goes for the bigram probabilities, thus causing an inconsistency.
To remedy this issue, we should have excluded all the characters except the ones we had in our alphabet
before we had started. This filtering can be done via the regular expression module re’s sub()
method:
import re

# Replace every character outside our alphabet ([a-z] plus space)
# with a space, so only tracked characters remain.
not_in_alphabet = re.compile('[^a-z ]')
text_filtered = not_in_alphabet.sub(' ', text)
print(text_filtered)
my darling child she cried folding the little girl in her arms and covering her face with kisses where in the world did you come from
Although regular expressions are a whole topic by themselves, to briefly explain: we first define a range inside the square brackets to include any letter from a to z ([a-z]), and then also add the space character to this range ([a-z ]). But since these are the characters we want to keep, we negate our statement by putting the negation sign ("^") at the start to mean "every character that is not in this range" ([^a-z ]). The second parameter is the replacement; by passing in " ", we are saying: replace every matching character with a space. The third parameter is the text we want to operate on.
Thus, we end up with a text that only contains the characters we are taking into account.
Below, we’ll repeat the same procedures we did above, only this time we’ll filter our text to only include the letters from a to z and the space character:
# Repeat the whole analysis, this time on the FILTERED text so that the
# marginal and bigram probabilities are computed over the same character set.
with open(fname, 'r') as book_file:
    data = book_file.read().replace('\n', ' ')
data = data.lower()
data = re.sub('[^a-z ]', ' ', data)

# Letter counts and probabilities
count_letter = {letter: data.count(letter) for letter in alphabet}
tot_count = np.sum(list(count_letter.values()))
probs_letter = {letter: count / tot_count
                for letter, count in count_letter.items()}

# Bigram counts and their grand total
count_bigram = {}
for first in alphabet:
    for second in alphabet:
        count_bigram[first + second] = data.count(first + second)
total_sum_bigram = sum(count_bigram.values())

# Marginal and joint probabilities
p_b = probs_letter["b"]
p_an = count_bigram["an"] / total_sum_bigram

# p(x="a"|y="n"): condition on the second letter being "n"
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
array_1 = np.array(list(subset_x_yn.values()))
total_subset_x_yn = array_1.sum()
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn

# p(y="n"|x="a"): condition on the first letter being "a"
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
array_2 = np.array(list(subset_y_xa.values()))
total_subset_y_xa = array_2.sum()
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa

# Bayes' theorem check: should now match prob_xa_yn exactly
prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29482886904761907
prob_xa_yn
0.29482886904761907
Thus, we have saved the Bayes Theorem! 8)
Visualizing the frequencies#
Using the pandas module, we can also “visualize” the frequencies:
import pandas as pd

# Tabulate the letter counts and shade each cell by its magnitude.
df_letter = pd.DataFrame(list(count_letter.values()),
                         index=alphabet,
                         columns=["count"])
df_letter.style.background_gradient(cmap="binary")
# https://matplotlib.org/stable/tutorials/colors/colormaps.html
# https://stackoverflow.com/a/50605020
count | |
---|---|
a | 13887 |
b | 2448 |
c | 4161 |
d | 8888 |
e | 22261 |
f | 3588 |
g | 3587 |
h | 11977 |
i | 10350 |
j | 240 |
k | 1920 |
l | 7256 |
m | 3830 |
n | 10752 |
o | 14240 |
p | 2466 |
q | 144 |
r | 10534 |
s | 9689 |
t | 16378 |
u | 4600 |
v | 1290 |
w | 5229 |
x | 209 |
y | 4128 |
z | 277 |
52771 |
# Build a long-format table of all bigrams: first letter, second letter,
# the bigram itself, and its count in the text.
# The previous version grew the DataFrame one row at a time with
# df.loc[-1] followed by reindexing, which is quadratic in the number of
# rows, leaves a reversed (728..0) index, and stores counts as floats.
# Building the records first and constructing the frame once is linear
# and yields a natural 0..N-1 index with integer counts.
records = [(letter1, letter2, letter1 + letter2, data.count(letter1 + letter2))
           for letter1 in alphabet
           for letter2 in alphabet]
df = pd.DataFrame(records, columns=["l1", "l2", "word", "count"])
df
l1 | l2 | word | count | |
---|---|---|---|---|
728 | a | a | aa | 0 |
727 | a | b | ab | 124 |
726 | a | c | ac | 350 |
725 | a | d | ad | 789 |
724 | a | e | ae | 1 |
... | ... | ... | ... | ... |
4 | w | w | 3634 | |
3 | x | x | 30 | |
2 | y | y | 756 | |
1 | z | z | 10 | |
0 | 6884 |
729 rows × 4 columns
# Reshape the long-format bigram table into a 27x27 count matrix,
# rows = first letter, columns = second letter.
# The previous version scanned the whole frame with a boolean mask for
# every cell (O(n^2) lookups) and assigned a 1-element Series into a
# scalar slot — a pattern that newer numpy/pandas versions reject.
# Indexing by "word" gives O(1) scalar lookups instead.
len_alpha = len(alphabet)
word_counts = df.set_index("word")["count"]
mat = np.zeros((len_alpha, len_alpha), int)
for i in range(len_alpha):
    for j in range(len_alpha):
        mat[i, j] = word_counts[alphabet[i] + alphabet[j]]
df_bi = pd.DataFrame(mat, index=alphabet, columns=alphabet)
import matplotlib.pyplot as plt
from matplotlib import colors
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return per-cell CSS background colors for one Styler column.

    Parameters
    ----------
    s : pd.Series
        The column of values being styled.
    m, M : float
        Global minimum / maximum, so the shading is normalized across
        the whole frame rather than per column.
    cmap : str
        Matplotlib colormap name.
    low, high : float
        Fractional padding applied below m / above M before normalizing.
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap is the supported accessor.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]
# Shade each bigram cell relative to the global min/max of the matrix
df_bi.style.apply(background_gradient,cmap="binary",
                  m=df_bi.min().min(),
                  M=df_bi.max().max(),
                  low=0,
                  high=0.85)
# https://stackoverflow.com/a/42563850
a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | z | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
a | 0 | 124 | 350 | 789 | 1 | 143 | 264 | 12 | 790 | 2 | 156 | 999 | 310 | 3170 | 0 | 218 | 0 | 1552 | 1506 | 1442 | 147 | 343 | 223 | 36 | 377 | 12 | 921 |
b | 254 | 9 | 0 | 0 | 857 | 0 | 0 | 0 | 120 | 2 | 0 | 191 | 0 | 0 | 207 | 0 | 0 | 243 | 7 | 6 | 393 | 0 | 0 | 0 | 147 | 0 | 12 |
c | 798 | 0 | 26 | 0 | 470 | 0 | 0 | 699 | 150 | 0 | 377 | 189 | 0 | 0 | 692 | 0 | 0 | 394 | 0 | 232 | 67 | 0 | 0 | 0 | 21 | 0 | 46 |
d | 199 | 0 | 6 | 73 | 715 | 12 | 19 | 1 | 358 | 0 | 1 | 130 | 189 | 16 | 781 | 0 | 0 | 94 | 170 | 0 | 29 | 7 | 5 | 0 | 57 | 0 | 6026 |
e | 1165 | 28 | 576 | 2124 | 806 | 184 | 88 | 27 | 175 | 2 | 17 | 660 | 512 | 1363 | 68 | 245 | 14 | 2876 | 1090 | 472 | 5 | 279 | 147 | 105 | 568 | 1 | 8664 |
f | 192 | 0 | 0 | 0 | 246 | 97 | 0 | 0 | 234 | 0 | 0 | 113 | 0 | 0 | 749 | 0 | 0 | 298 | 1 | 157 | 214 | 0 | 1 | 0 | 2 | 0 | 1284 |
g | 222 | 0 | 0 | 2 | 410 | 0 | 35 | 444 | 198 | 0 | 0 | 84 | 0 | 4 | 288 | 0 | 0 | 485 | 75 | 3 | 135 | 0 | 0 | 0 | 5 | 0 | 1197 |
h | 1515 | 0 | 0 | 0 | 6611 | 2 | 0 | 0 | 1157 | 0 | 34 | 5 | 0 | 0 | 688 | 0 | 0 | 142 | 21 | 299 | 69 | 0 | 2 | 0 | 416 | 0 | 1016 |
i | 59 | 89 | 423 | 677 | 424 | 266 | 309 | 0 | 22 | 0 | 71 | 624 | 386 | 2595 | 393 | 42 | 0 | 440 | 1048 | 1504 | 5 | 207 | 0 | 7 | 0 | 68 | 681 |
j | 8 | 0 | 0 | 0 | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 75 | 0 | 0 | 0 | 0 | 0 | 60 | 0 | 0 | 0 | 0 | 0 | 0 |
k | 62 | 0 | 0 | 0 | 741 | 10 | 0 | 0 | 269 | 0 | 0 | 27 | 7 | 110 | 1 | 0 | 0 | 0 | 82 | 0 | 0 | 0 | 3 | 0 | 20 | 0 | 588 |
l | 425 | 2 | 6 | 637 | 1083 | 90 | 2 | 0 | 828 | 0 | 117 | 1165 | 12 | 1 | 637 | 52 | 0 | 4 | 77 | 71 | 72 | 58 | 21 | 0 | 628 | 0 | 1268 |
m | 770 | 60 | 0 | 0 | 948 | 11 | 0 | 3 | 241 | 0 | 0 | 9 | 34 | 6 | 279 | 88 | 0 | 13 | 119 | 0 | 174 | 0 | 0 | 0 | 221 | 0 | 854 |
n | 99 | 100 | 242 | 2476 | 816 | 23 | 1192 | 13 | 221 | 5 | 204 | 102 | 1 | 85 | 752 | 2 | 14 | 14 | 393 | 691 | 40 | 7 | 1 | 15 | 194 | 0 | 3050 |
o | 108 | 35 | 67 | 407 | 62 | 1044 | 50 | 40 | 134 | 88 | 205 | 320 | 580 | 1549 | 808 | 245 | 7 | 1641 | 200 | 1062 | 1952 | 196 | 979 | 6 | 43 | 171 | 2241 |
p | 220 | 8 | 0 | 2 | 375 | 0 | 2 | 28 | 115 | 0 | 0 | 287 | 3 | 0 | 281 | 166 | 0 | 337 | 35 | 117 | 77 | 0 | 1 | 0 | 67 | 0 | 345 |
q | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 139 | 0 | 0 | 0 | 0 | 0 | 5 |
r | 654 | 9 | 71 | 272 | 2572 | 57 | 137 | 10 | 608 | 0 | 199 | 165 | 181 | 200 | 1537 | 63 | 0 | 182 | 333 | 334 | 153 | 16 | 16 | 0 | 387 | 0 | 2378 |
s | 599 | 1 | 302 | 0 | 943 | 4 | 1 | 870 | 281 | 0 | 197 | 96 | 58 | 25 | 607 | 128 | 6 | 0 | 284 | 1202 | 190 | 0 | 122 | 0 | 16 | 0 | 3757 |
t | 308 | 0 | 182 | 1 | 1125 | 13 | 0 | 6453 | 721 | 0 | 0 | 272 | 3 | 4 | 1762 | 0 | 0 | 497 | 178 | 304 | 159 | 0 | 44 | 0 | 187 | 0 | 4165 |
u | 50 | 26 | 128 | 62 | 117 | 34 | 250 | 0 | 108 | 0 | 0 | 547 | 62 | 615 | 0 | 248 | 0 | 503 | 462 | 802 | 0 | 0 | 0 | 0 | 0 | 7 | 579 |
v | 13 | 0 | 0 | 0 | 1105 | 0 | 0 | 0 | 108 | 0 | 0 | 0 | 0 | 0 | 45 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 10 |
w | 980 | 0 | 0 | 5 | 794 | 9 | 0 | 711 | 1031 | 0 | 2 | 24 | 0 | 163 | 616 | 0 | 0 | 19 | 50 | 0 | 2 | 0 | 9 | 0 | 0 | 0 | 805 |
x | 8 | 0 | 29 | 0 | 36 | 0 | 0 | 1 | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | 0 | 0 | 0 | 37 | 0 | 8 | 0 | 10 | 0 | 0 | 20 |
y | 16 | 3 | 16 | 0 | 182 | 4 | 0 | 0 | 61 | 0 | 0 | 3 | 4 | 1 | 703 | 3 | 0 | 20 | 101 | 41 | 0 | 0 | 12 | 0 | 0 | 0 | 2958 |
z | 47 | 0 | 0 | 0 | 29 | 0 | 0 | 0 | 6 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 7 | 8 | 172 |
5116 | 1954 | 1737 | 1361 | 696 | 1585 | 1238 | 2665 | 2368 | 141 | 340 | 1238 | 1488 | 845 | 2271 | 942 | 103 | 780 | 3457 | 7602 | 516 | 169 | 3634 | 30 | 756 | 10 | 6884 |