Probabilities Application: Letter Frequencies#

FIZ371 - Scientific & Technical Calculations | 11/10/2023

Emre S. Tasci emre.tasci@hacettepe.edu.tr

Using L. Frank Baum’s “The Wonderful Wizard of Oz” book, calculate the frequencies of the letters & bigrams.

(The book, written in 1900, is now in public domain and available from Project Gutenberg)

import numpy as np

We first read the text into the data variable and define the set of letters we are interested in:

fname = "supp/wizardofoz_1990_publicdomain_guthenberg.txt"
alphabet_str = "abcdefghijklmnopqrstuvwxyz "

# Split the alphabet string into a list of its single characters
alphabet = list(alphabet_str)

# Read the whole book; join lines with spaces so that words separated
# by a line break stay separated
with open(fname, 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

# Normalize to all lowercase
data = data.lower()

Letters#

Count and store the frequencies into the count_letter dictionary:

# Tally how many times each alphabet character appears in the text
count_letter = {letter: data.count(letter) for letter in alphabet}
count_letter
{'a': 13887,
 'b': 2448,
 'c': 4161,
 'd': 8888,
 'e': 22261,
 'f': 3588,
 'g': 3587,
 'h': 11977,
 'i': 10350,
 'j': 240,
 'k': 1920,
 'l': 7256,
 'm': 3830,
 'n': 10752,
 'o': 14240,
 'p': 2466,
 'q': 144,
 'r': 10534,
 's': 9689,
 't': 16378,
 'u': 4600,
 'v': 1290,
 'w': 5229,
 'x': 209,
 'y': 4128,
 'z': 277,
 ' ': 44331}

To calculate the probabilities, we divide each frequency by the total sum:

# Grand total of all counted characters (letters plus the space)
tot_count = np.sum(list(count_letter.values()))
print(tot_count)
218660
# Turn each raw count into a probability by normalizing with the total
probs_letter = {ch: cnt / tot_count for ch, cnt in count_letter.items()}
probs_letter
{'a': 0.06350955821823837,
 'b': 0.0111954632763194,
 'c': 0.019029543583645843,
 'd': 0.04064758071892436,
 'e': 0.1018064575139486,
 'f': 0.016409036860879904,
 'g': 0.01640446355071801,
 'h': 0.05477453580901857,
 'i': 0.04733376017561511,
 'j': 0.0010975944388548432,
 'k': 0.008780755510838746,
 'l': 0.033183938534711424,
 'm': 0.01751577792005854,
 'n': 0.04917223086069697,
 'o': 0.06512393670538735,
 'p': 0.011277782859233513,
 'q': 0.0006585566633129059,
 'r': 0.048175249245403826,
 's': 0.0443108021586024,
 't': 0.07490167383151926,
 'u': 0.021037226744717828,
 'v': 0.005899570108844782,
 'w': 0.023913838836549895,
 'x': 0.0009558218238360926,
 'y': 0.018878624348303303,
 'z': 0.0012668069148449649,
 ' ': 0.20273941278697521}

Bigrams#

We are going to generate all possible bigrams and count them, storing in count_bigram:

# Count every two-character sequence over our alphabet.
#
# NOTE: str.count() only reports NON-overlapping matches, so repeated-letter
# bigrams are undercounted (e.g. "eee" contains two "ee" pairs, yet
# "eee".count("ee") == 1).  A single sliding-window pass over the text
# counts every consecutive pair exactly once.
count_bigram = {x + y: 0 for x in alphabet for y in alphabet}
total_sum_bigram = 0
for first, second in zip(data, data[1:]):
    bigram = first + second
    # Pairs containing characters outside our alphabet are skipped,
    # matching the original behavior of only counting alphabet bigrams
    if bigram in count_bigram:
        count_bigram[bigram] += 1
        total_sum_bigram += 1
count_bigram
{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897,
 'ba': 254,
 'bb': 9,
 'bc': 0,
 'bd': 0,
 'be': 857,
 'bf': 0,
 'bg': 0,
 'bh': 0,
 'bi': 120,
 'bj': 2,
 'bk': 0,
 'bl': 191,
 'bm': 0,
 'bn': 0,
 'bo': 207,
 'bp': 0,
 'bq': 0,
 'br': 243,
 'bs': 7,
 'bt': 6,
 'bu': 393,
 'bv': 0,
 'bw': 0,
 'bx': 0,
 'by': 147,
 'bz': 0,
 'b ': 7,
 'ca': 798,
 'cb': 0,
 'cc': 26,
 'cd': 0,
 'ce': 470,
 'cf': 0,
 'cg': 0,
 'ch': 699,
 'ci': 150,
 'cj': 0,
 'ck': 377,
 'cl': 189,
 'cm': 0,
 'cn': 0,
 'co': 692,
 'cp': 0,
 'cq': 0,
 'cr': 394,
 'cs': 0,
 'ct': 232,
 'cu': 67,
 'cv': 0,
 'cw': 0,
 'cx': 0,
 'cy': 21,
 'cz': 0,
 'c ': 41,
 'da': 199,
 'db': 0,
 'dc': 6,
 'dd': 73,
 'de': 715,
 'df': 12,
 'dg': 19,
 'dh': 1,
 'di': 358,
 'dj': 0,
 'dk': 1,
 'dl': 130,
 'dm': 189,
 'dn': 16,
 'do': 781,
 'dp': 0,
 'dq': 0,
 'dr': 94,
 'ds': 170,
 'dt': 0,
 'du': 29,
 'dv': 7,
 'dw': 5,
 'dx': 0,
 'dy': 57,
 'dz': 0,
 'd ': 5375,
 'ea': 1165,
 'eb': 28,
 'ec': 576,
 'ed': 2124,
 'ee': 806,
 'ef': 184,
 'eg': 88,
 'eh': 27,
 'ei': 175,
 'ej': 2,
 'ek': 17,
 'el': 660,
 'em': 512,
 'en': 1363,
 'eo': 68,
 'ep': 245,
 'eq': 14,
 'er': 2876,
 'es': 1090,
 'et': 472,
 'eu': 5,
 'ev': 279,
 'ew': 147,
 'ex': 105,
 'ey': 568,
 'ez': 1,
 'e ': 7884,
 'fa': 192,
 'fb': 0,
 'fc': 0,
 'fd': 0,
 'fe': 246,
 'ff': 97,
 'fg': 0,
 'fh': 0,
 'fi': 234,
 'fj': 0,
 'fk': 0,
 'fl': 113,
 'fm': 0,
 'fn': 0,
 'fo': 749,
 'fp': 0,
 'fq': 0,
 'fr': 298,
 'fs': 1,
 'ft': 157,
 'fu': 214,
 'fv': 0,
 'fw': 1,
 'fx': 0,
 'fy': 2,
 'fz': 0,
 'f ': 1214,
 'ga': 222,
 'gb': 0,
 'gc': 0,
 'gd': 2,
 'ge': 410,
 'gf': 0,
 'gg': 35,
 'gh': 444,
 'gi': 198,
 'gj': 0,
 'gk': 0,
 'gl': 84,
 'gm': 0,
 'gn': 4,
 'go': 288,
 'gp': 0,
 'gq': 0,
 'gr': 485,
 'gs': 75,
 'gt': 3,
 'gu': 135,
 'gv': 0,
 'gw': 0,
 'gx': 0,
 'gy': 5,
 'gz': 0,
 'g ': 964,
 'ha': 1515,
 'hb': 0,
 'hc': 0,
 'hd': 0,
 'he': 6611,
 'hf': 2,
 'hg': 0,
 'hh': 0,
 'hi': 1157,
 'hj': 0,
 'hk': 34,
 'hl': 5,
 'hm': 0,
 'hn': 0,
 'ho': 688,
 'hp': 0,
 'hq': 0,
 'hr': 142,
 'hs': 21,
 'ht': 299,
 'hu': 69,
 'hv': 0,
 'hw': 2,
 'hx': 0,
 'hy': 416,
 'hz': 0,
 'h ': 846,
 'ia': 59,
 'ib': 89,
 'ic': 423,
 'id': 677,
 'ie': 424,
 'if': 266,
 'ig': 309,
 'ih': 0,
 'ii': 22,
 'ij': 0,
 'ik': 71,
 'il': 624,
 'im': 386,
 'in': 2595,
 'io': 393,
 'ip': 42,
 'iq': 0,
 'ir': 440,
 'is': 1048,
 'it': 1504,
 'iu': 5,
 'iv': 207,
 'iw': 0,
 'ix': 7,
 'iy': 0,
 'iz': 68,
 'i ': 611,
 'ja': 8,
 'jb': 0,
 'jc': 0,
 'jd': 0,
 'je': 97,
 'jf': 0,
 'jg': 0,
 'jh': 0,
 'ji': 0,
 'jj': 0,
 'jk': 0,
 'jl': 0,
 'jm': 0,
 'jn': 0,
 'jo': 75,
 'jp': 0,
 'jq': 0,
 'jr': 0,
 'js': 0,
 'jt': 0,
 'ju': 60,
 'jv': 0,
 'jw': 0,
 'jx': 0,
 'jy': 0,
 'jz': 0,
 'j ': 0,
 'ka': 62,
 'kb': 0,
 'kc': 0,
 'kd': 0,
 'ke': 741,
 'kf': 10,
 'kg': 0,
 'kh': 0,
 'ki': 269,
 'kj': 0,
 'kk': 0,
 'kl': 27,
 'km': 7,
 'kn': 110,
 'ko': 1,
 'kp': 0,
 'kq': 0,
 'kr': 0,
 'ks': 82,
 'kt': 0,
 'ku': 0,
 'kv': 0,
 'kw': 3,
 'kx': 0,
 'ky': 20,
 'kz': 0,
 'k ': 455,
 'la': 425,
 'lb': 2,
 'lc': 6,
 'ld': 637,
 'le': 1083,
 'lf': 90,
 'lg': 2,
 'lh': 0,
 'li': 828,
 'lj': 0,
 'lk': 117,
 'll': 1165,
 'lm': 12,
 'ln': 1,
 'lo': 637,
 'lp': 52,
 'lq': 0,
 'lr': 4,
 'ls': 77,
 'lt': 71,
 'lu': 72,
 'lv': 58,
 'lw': 21,
 'lx': 0,
 'ly': 628,
 'lz': 0,
 'l ': 1049,
 'ma': 770,
 'mb': 60,
 'mc': 0,
 'md': 0,
 'me': 948,
 'mf': 11,
 'mg': 0,
 'mh': 3,
 'mi': 241,
 'mj': 0,
 'mk': 0,
 'ml': 9,
 'mm': 34,
 'mn': 6,
 'mo': 279,
 'mp': 88,
 'mq': 0,
 'mr': 13,
 'ms': 119,
 'mt': 0,
 'mu': 174,
 'mv': 0,
 'mw': 0,
 'mx': 0,
 'my': 221,
 'mz': 0,
 'm ': 654,
 'na': 99,
 'nb': 100,
 'nc': 242,
 'nd': 2476,
 'ne': 816,
 'nf': 23,
 'ng': 1192,
 'nh': 13,
 'ni': 221,
 'nj': 5,
 'nk': 204,
 'nl': 102,
 'nm': 1,
 'nn': 85,
 'no': 752,
 'np': 2,
 'nq': 14,
 'nr': 14,
 'ns': 393,
 'nt': 691,
 'nu': 40,
 'nv': 7,
 'nw': 1,
 'nx': 15,
 'ny': 194,
 'nz': 0,
 'n ': 2485,
 'oa': 108,
 'ob': 35,
 'oc': 67,
 'od': 407,
 'oe': 62,
 'of': 1044,
 'og': 50,
 'oh': 40,
 'oi': 134,
 'oj': 88,
 'ok': 205,
 'ol': 320,
 'om': 580,
 'on': 1549,
 'oo': 808,
 'op': 245,
 'oq': 7,
 'or': 1641,
 'os': 200,
 'ot': 1062,
 'ou': 1952,
 'ov': 196,
 'ow': 979,
 'ox': 6,
 'oy': 43,
 'oz': 171,
 'o ': 2101,
 'pa': 220,
 'pb': 8,
 'pc': 0,
 'pd': 2,
 'pe': 375,
 'pf': 0,
 'pg': 2,
 'ph': 28,
 'pi': 115,
 'pj': 0,
 'pk': 0,
 'pl': 287,
 'pm': 3,
 'pn': 0,
 'po': 281,
 'pp': 166,
 'pq': 0,
 'pr': 337,
 'ps': 35,
 'pt': 117,
 'pu': 77,
 'pv': 0,
 'pw': 1,
 'px': 0,
 'py': 67,
 'pz': 0,
 'p ': 265,
 'qa': 0,
 'qb': 0,
 'qc': 0,
 'qd': 0,
 'qe': 0,
 'qf': 0,
 'qg': 0,
 'qh': 0,
 'qi': 0,
 'qj': 0,
 'qk': 0,
 'ql': 0,
 'qm': 0,
 'qn': 0,
 'qo': 0,
 'qp': 0,
 'qq': 0,
 'qr': 0,
 'qs': 0,
 'qt': 0,
 'qu': 139,
 'qv': 0,
 'qw': 0,
 'qx': 0,
 'qy': 0,
 'qz': 0,
 'q ': 3,
 'ra': 654,
 'rb': 9,
 'rc': 71,
 'rd': 272,
 're': 2572,
 'rf': 57,
 'rg': 137,
 'rh': 10,
 'ri': 608,
 'rj': 0,
 'rk': 199,
 'rl': 165,
 'rm': 181,
 'rn': 200,
 'ro': 1537,
 'rp': 63,
 'rq': 0,
 'rr': 182,
 'rs': 333,
 'rt': 334,
 'ru': 153,
 'rv': 16,
 'rw': 16,
 'rx': 0,
 'ry': 387,
 'rz': 0,
 'r ': 2002,
 'sa': 599,
 'sb': 1,
 'sc': 302,
 'sd': 0,
 'se': 943,
 'sf': 4,
 'sg': 1,
 'sh': 870,
 'si': 281,
 'sj': 0,
 'sk': 197,
 'sl': 96,
 'sm': 58,
 'sn': 25,
 'so': 607,
 'sp': 128,
 'sq': 6,
 'sr': 0,
 'ss': 284,
 'st': 1202,
 'su': 190,
 'sv': 0,
 'sw': 122,
 'sx': 0,
 'sy': 16,
 'sz': 0,
 's ': 3081,
 'ta': 308,
 'tb': 0,
 'tc': 182,
 'td': 1,
 'te': 1125,
 'tf': 13,
 'tg': 0,
 'th': 6453,
 'ti': 721,
 'tj': 0,
 'tk': 0,
 'tl': 272,
 'tm': 3,
 'tn': 4,
 'to': 1762,
 'tp': 0,
 'tq': 0,
 'tr': 497,
 'ts': 178,
 'tt': 304,
 'tu': 159,
 'tv': 0,
 'tw': 44,
 'tx': 0,
 'ty': 187,
 'tz': 0,
 't ': 3602,
 'ua': 50,
 'ub': 26,
 'uc': 128,
 'ud': 62,
 'ue': 117,
 'uf': 34,
 'ug': 250,
 'uh': 0,
 'ui': 108,
 'uj': 0,
 'uk': 0,
 'ul': 547,
 'um': 62,
 'un': 615,
 'uo': 0,
 'up': 248,
 'uq': 0,
 'ur': 503,
 'us': 462,
 'ut': 802,
 'uu': 0,
 'uv': 0,
 'uw': 0,
 'ux': 0,
 'uy': 0,
 'uz': 7,
 'u ': 490,
 'va': 13,
 'vb': 0,
 'vc': 0,
 'vd': 0,
 've': 1105,
 'vf': 0,
 'vg': 0,
 'vh': 0,
 'vi': 108,
 'vj': 0,
 'vk': 0,
 'vl': 0,
 'vm': 0,
 'vn': 0,
 'vo': 45,
 'vp': 0,
 'vq': 0,
 'vr': 0,
 'vs': 0,
 'vt': 0,
 'vu': 0,
 'vv': 0,
 'vw': 0,
 'vx': 0,
 'vy': 9,
 'vz': 0,
 'v ': 5,
 'wa': 980,
 'wb': 0,
 'wc': 0,
 'wd': 5,
 'we': 794,
 'wf': 9,
 'wg': 0,
 'wh': 711,
 'wi': 1031,
 'wj': 0,
 'wk': 2,
 'wl': 24,
 'wm': 0,
 'wn': 163,
 'wo': 616,
 'wp': 0,
 'wq': 0,
 'wr': 19,
 'ws': 50,
 'wt': 0,
 'wu': 2,
 'wv': 0,
 'ww': 9,
 'wx': 0,
 'wy': 0,
 'wz': 0,
 'w ': 588,
 'xa': 8,
 'xb': 0,
 'xc': 29,
 'xd': 0,
 'xe': 36,
 'xf': 0,
 'xg': 0,
 'xh': 1,
 'xi': 36,
 'xj': 0,
 'xk': 0,
 'xl': 0,
 'xm': 0,
 'xn': 0,
 'xo': 0,
 'xp': 24,
 'xq': 0,
 'xr': 0,
 'xs': 0,
 'xt': 37,
 'xu': 0,
 'xv': 8,
 'xw': 0,
 'xx': 10,
 'xy': 0,
 'xz': 0,
 'x ': 12,
 'ya': 16,
 'yb': 3,
 'yc': 16,
 'yd': 0,
 'ye': 182,
 'yf': 4,
 'yg': 0,
 'yh': 0,
 'yi': 61,
 'yj': 0,
 'yk': 0,
 'yl': 3,
 'ym': 4,
 'yn': 1,
 'yo': 703,
 'yp': 3,
 'yq': 0,
 'yr': 20,
 'ys': 101,
 'yt': 41,
 'yu': 0,
 'yv': 0,
 'yw': 12,
 'yx': 0,
 'yy': 0,
 'yz': 0,
 'y ': 2351,
 'za': 47,
 'zb': 0,
 'zc': 0,
 'zd': 0,
 'ze': 29,
 'zf': 0,
 'zg': 0,
 'zh': 0,
 'zi': 6,
 'zj': 0,
 'zk': 0,
 'zl': 6,
 'zm': 0,
 'zn': 0,
 'zo': 0,
 'zp': 0,
 'zq': 0,
 'zr': 0,
 'zs': 0,
 'zt': 0,
 'zu': 2,
 'zv': 0,
 'zw': 0,
 'zx': 0,
 'zy': 7,
 'zz': 8,
 'z ': 93,
 ' a': 5006,
 ' b': 1847,
 ' c': 1691,
 ' d': 1331,
 ' e': 667,
 ' f': 1531,
 ' g': 1208,
 ' h': 2599,
 ' i': 2128,
 ' j': 139,
 ' k': 335,
 ' l': 1203,
 ' m': 1441,
 ' n': 808,
 ' o': 2198,
 ' p': 918,
 ' q': 100,
 ' r': 765,
 ' s': 3324,
 ' t': 7379,
 ' u': 514,
 ' v': 154,
 ' w': 3466,
 ' x': 30,
 ' y': 705,
 ' z': 4,
 '  ': 1401}

Exercise#

Calculate the probabilities:

\[p("b") = ?\]
\[p("an") = ?\]
# Marginal probability of the letter "b", read off the letter frequencies
p_b = probs_letter["b"]
p_b
0.0111954632763194
# Probability of the bigram "an" among all counted bigrams
p_an = count_bigram["an"] / total_sum_bigram
p_an
0.015006982744336875

Exercise#

Calculate the probabilities:

\[p(x="a" | y ="n") =?\]
\[p(y="n"|x="a") =?\]

\(p(x="a"|y="n")\)#

To calculate the first probability, we need to define a subset that contains all the bigrams whose second letter is equal to “n”:

# Restrict attention to the bigrams whose second letter is "n"
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
subset_x_yn
{'an': 3170,
 'bn': 0,
 'cn': 0,
 'dn': 16,
 'en': 1363,
 'fn': 0,
 'gn': 4,
 'hn': 0,
 'in': 2595,
 'jn': 0,
 'kn': 110,
 'ln': 1,
 'mn': 6,
 'nn': 85,
 'on': 1549,
 'pn': 0,
 'qn': 0,
 'rn': 200,
 'sn': 25,
 'tn': 4,
 'un': 615,
 'vn': 0,
 'wn': 163,
 'xn': 0,
 'yn': 1,
 'zn': 0,
 ' n': 808}

The total of the subset, total_subset_x_yn, is:

subset_x_yn.values()
dict_values([3170, 0, 0, 16, 1363, 0, 4, 0, 2595, 0, 110, 1, 6, 85, 1549, 0, 0, 200, 25, 4, 615, 0, 163, 0, 1, 0, 808])
# Convert the dict values to a numpy array so we can
# evaluate the total directly via the array's 'sum()' method:
array_1 = np.array(list(subset_x_yn.values()))
array_1
array([3170,    0,    0,   16, 1363,    0,    4,    0, 2595,    0,  110,
          1,    6,   85, 1549,    0,    0,  200,   25,    4,  615,    0,
        163,    0,    1,    0,  808])
# Normalization constant: total number of bigrams ending in "n"
total_subset_x_yn = array_1.sum()
total_subset_x_yn
10715

Thus, \(p(x="a"|y="n")\) probability is:

\[p(x="a"|y="n") = \frac{3170}{10715}\approx0.296\]
# p(x="a" | y="n"): the fraction of the "?n" bigrams that are "an"
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn
prob_xa_yn
0.2958469435370975

\(p(y="n"|x="a")\)#

For the second probability, we construct a new subset that contains the bigrams whose first letter is “a”:

# Restrict attention to the bigrams whose first letter is "a"
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
subset_y_xa
{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897}

The rest is similar to the first one:

# Collect this subset's counts into a numpy array as well
array_2 = np.array(list(subset_y_xa.values()))
array_2
array([   0,  124,  350,  789,    1,  143,  264,   12,  790,    2,  156,
        999,  310, 3170,    0,  218,    0, 1552, 1506, 1442,  147,  343,
        223,   36,  377,   12,  897])
# Normalization constant: total number of bigrams starting with "a"
total_subset_y_xa = array_2.sum()
total_subset_y_xa
13863
\[p(y="n"|x="a") = \frac{3170}{13863}\approx0.229\]
# p(y="n" | x="a"): the fraction of the "a?" bigrams that are "an"
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa
prob_yn_xa
0.22866623385991489

Exercise#

Calculate the probability:

\[\frac{p(y="n"|x="a")p("a")}{p("n")}\]

For this one, we have all the factors:

prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29533928474819926

Let’s check if this is equal to \(p(x="a"|y="n")\) as Bayes Theorem dictates:

prob_xa_yn
0.2958469435370975

The two values are close but not equal. I hope that you are able to figure out the reason for this difference. If not, please ponder on it a while before proceeding! 8)


















Wait while the reader ponders!
(Spoilers ahead!)















The reason is due to the characters unaccounted for (e.g., “d.”). Consider the following paragraph:

“My darling child!” she cried, folding the little girl in her arms and covering her face with kisses. “Where in the world did you come from?”

If we were to calculate the marginal probability of \(p("n")\), for example, we would first count all the occurrences of the letter “n” and then divide by all the characters included in the text:

# A sample paragraph that contains punctuation characters
# not present in our 27-symbol alphabet
text = '''"My darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "Where in the world did you come from?"'''

text = text.lower()
print(text)
"my darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "where in the world did you come from?"
# Occurrences of "n" in the sample paragraph
count_n = text.count("n")
count_n
6
# Total number of characters, punctuation included
count_all = len(text)
count_all
141
# Marginal probability of "n" over ALL characters (including the
# punctuation that the bigram counting never sees)
p_n = count_n / count_all
p_n
0.0425531914893617

However, this marginal probability includes the uncounted characters such as {"'", '"', ",", ".", "!"} which aren’t accounted for when we were calculating the bigram probabilities, thus messing with our calculations. The same issue affects the bigram probabilities as well, causing an inconsistency.

To remedy this issue, we should have excluded all the characters except the ones we had in our alphabet before we had started. This filtering can be done via the regular expression module re’s sub() method:

import re
# Replace every character outside our alphabet ('[^a-z ]' matches
# anything that is not a lowercase letter or a space) with a space
text_filtered = re.sub('[^a-z ]',' ',text)
print(text_filtered)
 my darling child   she cried  folding the little girl in her arms and covering her face with kisses   where in the world did you come from  

Although regular expressions are a whole topic by themselves, to briefly explain: we first define a range inside the square brackets to include any letter from a to z ([a-z]), and then also add the space character to this range ([a-z ]). But since these are the characters we want to keep, we negate our statement by putting the negation sign ("^") at the start, meaning “every character that is not in this range” ([^a-z ]). The second parameter is the replacement: by passing " " (a space), we are saying that every matching character should be replaced with a space. The third parameter is the text we want to operate on.

Thus, we end up with a text that only contains the characters we are taking into account.

Below, we’ll repeat the same procedures we did above, only this time we’ll filter our text to only include the letters from a to z and the space character:

# Re-read the book and redo the whole analysis, this time filtering the
# text so that it contains only the characters of our alphabet.
with open(fname, 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

# Convert it to all lowercase
data = data.lower()

# Replace every out-of-alphabet character with a space
data = re.sub('[^a-z ]', ' ', data)

# Letter counts and probabilities
count_letter = {}
for letter in alphabet:
    count_letter[letter] = data.count(letter)

tot_count = np.sum(list(count_letter.values()))

probs_letter = {}
for letter in count_letter.keys():
    probs_letter[letter] = count_letter[letter] / tot_count

# Bigram counts via a single sliding-window pass: str.count() would
# miss overlapping occurrences (e.g. the two "ee" pairs inside "eee"),
# so we examine every consecutive character pair instead.
count_bigram = {x + y: 0 for x in alphabet for y in alphabet}
total_sum_bigram = 0
for first, second in zip(data, data[1:]):
    bigram = first + second
    if bigram in count_bigram:
        count_bigram[bigram] += 1
        total_sum_bigram += 1

# Marginal probabilities of "b" and of the bigram "an"
p_b = probs_letter["b"]

p_an = count_bigram["an"] / total_sum_bigram

# Subset of bigrams whose second letter is "n"
subset_x_yn = {}
for x in alphabet:
    bigram = x + "n"
    subset_x_yn[bigram] = count_bigram[bigram]

array_1 = np.array(list(subset_x_yn.values()))

total_subset_x_yn = array_1.sum()

# p(x="a" | y="n")
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn

# Subset of bigrams whose first letter is "a"
subset_y_xa = {}
for y in alphabet:
    bigram = "a" + y
    subset_y_xa[bigram] = count_bigram[bigram]

array_2 = np.array(list(subset_y_xa.values()))

total_subset_y_xa = array_2.sum()

# p(y="n" | x="a")
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa

# Bayes' theorem check: this should equal prob_xa_yn
prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29482886904761907
prob_xa_yn
0.29482886904761907

Thus, we have saved the Bayes Theorem! 8)

\[p(a|b) = \frac{p(b|a)p(a)}{p(b)}\]

Visualizing the frequencies#

Using the pandas module, we can also “visualize” the frequencies:

import pandas as pd

# Tabulate the letter counts; the background gradient shades the
# cells so that darker means more frequent.
letter_counts = list(count_letter.values())
df_letter = pd.DataFrame(letter_counts,
                         index=alphabet,
                         columns=["count"])
df_letter.style.background_gradient(cmap="binary")

# https://matplotlib.org/stable/tutorials/colors/colormaps.html
# https://stackoverflow.com/a/50605020
  count
a 13887
b 2448
c 4161
d 8888
e 22261
f 3588
g 3587
h 11977
i 10350
j 240
k 1920
l 7256
m 3830
n 10752
o 14240
p 2466
q 144
r 10534
s 9689
t 16378
u 4600
v 1290
w 5229
x 209
y 4128
z 277
52771
# Build one row per (letter1, letter2) pair.  Collecting all the rows
# first and constructing the DataFrame once avoids the quadratic cost
# of the df.loc[-1] append + full reindex on every single insertion.
rows = [[l1, l2, data.count(l1 + l2)]
        for l1 in alphabet for l2 in alphabet]
# The original insert-at-top loop left the first pair ("aa") carrying
# the highest index label and the last pair label 0; reproduce that.
df = pd.DataFrame(rows, columns=["l1", "l2", "count"],
                  index=range(len(rows) - 1, -1, -1))
df.insert(loc=2, column="word", value=df["l1"] + df["l2"])
df
l1 l2 word count
728 a a aa 0
727 a b ab 124
726 a c ac 350
725 a d ad 789
724 a e ae 1
... ... ... ... ...
4 w w 3634
3 x x 30
2 y y 756
1 z z 10
0 6884

729 rows × 4 columns

len_alpha = len(alphabet)

# Pack the bigram counts into a (27 x 27) matrix: rows index the first
# letter of the bigram, columns the second.
mat = np.zeros((len_alpha, len_alpha), int)
for i in range(len_alpha):
    for j in range(len_alpha):
        # Select the single matching row and pull the scalar out
        # explicitly: assigning the one-element Series itself relies on
        # implicit array-to-scalar conversion, which NumPy deprecated.
        sel = df.loc[df.word == (alphabet[i] + alphabet[j]), "count"]
        mat[i, j] = sel.values[0]

df_bi = pd.DataFrame(mat, index=alphabet, columns=alphabet)

import matplotlib.pyplot as plt
from matplotlib import colors

def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return per-cell CSS background colors for one styled column.

    s    -- the Series (one DataFrame column) being styled
    m, M -- global minimum and maximum, so every column is shaded on
            the same scale instead of per-column
    cmap -- matplotlib colormap name
    low, high -- fractions of the range by which to extend the
            normalization below m / above M
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # matplotlib 3.7 and removed in 3.9.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]


# Shade the whole bigram table on one global scale by passing the
# global min/max; high=0.85 stretches the normalization range above the
# maximum, presumably so the largest counts do not saturate to black
df_bi.style.apply(background_gradient,cmap="binary",
               m=df_bi.min().min(),
               M=df_bi.max().max(),
               low=0,
               high=0.85)

# https://stackoverflow.com/a/42563850
  a b c d e f g h i j k l m n o p q r s t u v w x y z
a 0 124 350 789 1 143 264 12 790 2 156 999 310 3170 0 218 0 1552 1506 1442 147 343 223 36 377 12 921
b 254 9 0 0 857 0 0 0 120 2 0 191 0 0 207 0 0 243 7 6 393 0 0 0 147 0 12
c 798 0 26 0 470 0 0 699 150 0 377 189 0 0 692 0 0 394 0 232 67 0 0 0 21 0 46
d 199 0 6 73 715 12 19 1 358 0 1 130 189 16 781 0 0 94 170 0 29 7 5 0 57 0 6026
e 1165 28 576 2124 806 184 88 27 175 2 17 660 512 1363 68 245 14 2876 1090 472 5 279 147 105 568 1 8664
f 192 0 0 0 246 97 0 0 234 0 0 113 0 0 749 0 0 298 1 157 214 0 1 0 2 0 1284
g 222 0 0 2 410 0 35 444 198 0 0 84 0 4 288 0 0 485 75 3 135 0 0 0 5 0 1197
h 1515 0 0 0 6611 2 0 0 1157 0 34 5 0 0 688 0 0 142 21 299 69 0 2 0 416 0 1016
i 59 89 423 677 424 266 309 0 22 0 71 624 386 2595 393 42 0 440 1048 1504 5 207 0 7 0 68 681
j 8 0 0 0 97 0 0 0 0 0 0 0 0 0 75 0 0 0 0 0 60 0 0 0 0 0 0
k 62 0 0 0 741 10 0 0 269 0 0 27 7 110 1 0 0 0 82 0 0 0 3 0 20 0 588
l 425 2 6 637 1083 90 2 0 828 0 117 1165 12 1 637 52 0 4 77 71 72 58 21 0 628 0 1268
m 770 60 0 0 948 11 0 3 241 0 0 9 34 6 279 88 0 13 119 0 174 0 0 0 221 0 854
n 99 100 242 2476 816 23 1192 13 221 5 204 102 1 85 752 2 14 14 393 691 40 7 1 15 194 0 3050
o 108 35 67 407 62 1044 50 40 134 88 205 320 580 1549 808 245 7 1641 200 1062 1952 196 979 6 43 171 2241
p 220 8 0 2 375 0 2 28 115 0 0 287 3 0 281 166 0 337 35 117 77 0 1 0 67 0 345
q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 139 0 0 0 0 0 5
r 654 9 71 272 2572 57 137 10 608 0 199 165 181 200 1537 63 0 182 333 334 153 16 16 0 387 0 2378
s 599 1 302 0 943 4 1 870 281 0 197 96 58 25 607 128 6 0 284 1202 190 0 122 0 16 0 3757
t 308 0 182 1 1125 13 0 6453 721 0 0 272 3 4 1762 0 0 497 178 304 159 0 44 0 187 0 4165
u 50 26 128 62 117 34 250 0 108 0 0 547 62 615 0 248 0 503 462 802 0 0 0 0 0 7 579
v 13 0 0 0 1105 0 0 0 108 0 0 0 0 0 45 0 0 0 0 0 0 0 0 0 9 0 10
w 980 0 0 5 794 9 0 711 1031 0 2 24 0 163 616 0 0 19 50 0 2 0 9 0 0 0 805
x 8 0 29 0 36 0 0 1 36 0 0 0 0 0 0 24 0 0 0 37 0 8 0 10 0 0 20
y 16 3 16 0 182 4 0 0 61 0 0 3 4 1 703 3 0 20 101 41 0 0 12 0 0 0 2958
z 47 0 0 0 29 0 0 0 6 0 0 6 0 0 0 0 0 0 0 0 2 0 0 0 7 8 172
5116 1954 1737 1361 696 1585 1238 2665 2368 141 340 1238 1488 845 2271 942 103 780 3457 7602 516 169 3634 30 756 10 6884