Probabilities Application: Letter Frequencies#

FIZ371 - Scientific & Technical Calculations | 11/10/2023

Emre S. Tasci emre.tasci@hacettepe.edu.tr

Using L. Frank Baum’s “The Wonderful Wizard of Oz” book, calculate the frequencies of the letters & bigrams.

(The book, written in 1900, is now in public domain and available from Project Gutenberg)

import numpy as np

We first read the text into the data variable and define the set of letters we are interested in:

fname = "supp/wizardofoz_1990_publicdomain_guthenberg.txt"
alphabet_str = "abcdefghijklmnopqrstuvwxyz "

# Split the alphabet string into a list of its single characters
alphabet = list(alphabet_str)

# Read the whole book; join lines with spaces so that words separated
# by a line break stay separated
with open(fname, 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

# Normalize to all lowercase
data = data.lower()

Letters#

Count and store the frequencies into the count_letter dictionary:

# Tally how many times each alphabet character appears in the text
count_letter = {letter: data.count(letter) for letter in alphabet}
count_letter
{'a': 13887,
 'b': 2448,
 'c': 4161,
 'd': 8888,
 'e': 22261,
 'f': 3588,
 'g': 3587,
 'h': 11977,
 'i': 10350,
 'j': 240,
 'k': 1920,
 'l': 7256,
 'm': 3830,
 'n': 10752,
 'o': 14240,
 'p': 2466,
 'q': 144,
 'r': 10534,
 's': 9689,
 't': 16378,
 'u': 4600,
 'v': 1290,
 'w': 5229,
 'x': 209,
 'y': 4128,
 'z': 277,
 ' ': 44331}

To calculate the probabilities, we divide each frequency by the total sum:

# Grand total of all counted characters (letters plus the space)
tot_count = np.sum(list(count_letter.values()))
print(tot_count)
218660
# Turn each raw count into a probability by normalizing with the total
probs_letter = {ch: cnt / tot_count for ch, cnt in count_letter.items()}
probs_letter
{'a': 0.06350955821823837,
 'b': 0.0111954632763194,
 'c': 0.019029543583645843,
 'd': 0.04064758071892436,
 'e': 0.1018064575139486,
 'f': 0.016409036860879904,
 'g': 0.01640446355071801,
 'h': 0.05477453580901857,
 'i': 0.04733376017561511,
 'j': 0.0010975944388548432,
 'k': 0.008780755510838746,
 'l': 0.033183938534711424,
 'm': 0.01751577792005854,
 'n': 0.04917223086069697,
 'o': 0.06512393670538735,
 'p': 0.011277782859233513,
 'q': 0.0006585566633129059,
 'r': 0.048175249245403826,
 's': 0.0443108021586024,
 't': 0.07490167383151926,
 'u': 0.021037226744717828,
 'v': 0.005899570108844782,
 'w': 0.023913838836549895,
 'x': 0.0009558218238360926,
 'y': 0.018878624348303303,
 'z': 0.0012668069148449649,
 ' ': 0.20273941278697521}

Bigrams#

We are going to generate all possible bigrams and count them, storing in count_bigram:

# Count every two-character sequence over our alphabet.
#
# NOTE: str.count() only reports NON-overlapping matches, so repeated-letter
# bigrams are undercounted (e.g. "eee" contains two "ee" pairs, yet
# "eee".count("ee") == 1).  A single sliding-window pass over the text
# counts every consecutive pair exactly once.
count_bigram = {x + y: 0 for x in alphabet for y in alphabet}
total_sum_bigram = 0
for first, second in zip(data, data[1:]):
    bigram = first + second
    # Pairs containing characters outside our alphabet are skipped,
    # matching the original behavior of only counting alphabet bigrams
    if bigram in count_bigram:
        count_bigram[bigram] += 1
        total_sum_bigram += 1
count_bigram
{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897,
 'ba': 254,
 'bb': 9,
 'bc': 0,
 'bd': 0,
 'be': 857,
 'bf': 0,
 'bg': 0,
 'bh': 0,
 'bi': 120,
 'bj': 2,
 'bk': 0,
 'bl': 191,
 'bm': 0,
 'bn': 0,
 'bo': 207,
 'bp': 0,
 'bq': 0,
 'br': 243,
 'bs': 7,
 'bt': 6,
 'bu': 393,
 'bv': 0,
 'bw': 0,
 'bx': 0,
 'by': 147,
 'bz': 0,
 'b ': 7,
 'ca': 798,
 'cb': 0,
 'cc': 26,
 'cd': 0,
 'ce': 470,
 'cf': 0,
 'cg': 0,
 'ch': 699,
 'ci': 150,
 'cj': 0,
 'ck': 377,
 'cl': 189,
 'cm': 0,
 'cn': 0,
 'co': 692,
 'cp': 0,
 'cq': 0,
 'cr': 394,
 'cs': 0,
 'ct': 232,
 'cu': 67,
 'cv': 0,
 'cw': 0,
 'cx': 0,
 'cy': 21,
 'cz': 0,
 'c ': 41,
 'da': 199,
 'db': 0,
 'dc': 6,
 'dd': 73,
 'de': 715,
 'df': 12,
 'dg': 19,
 'dh': 1,
 'di': 358,
 'dj': 0,
 'dk': 1,
 'dl': 130,
 'dm': 189,
 'dn': 16,
 'do': 781,
 'dp': 0,
 'dq': 0,
 'dr': 94,
 'ds': 170,
 'dt': 0,
 'du': 29,
 'dv': 7,
 'dw': 5,
 'dx': 0,
 'dy': 57,
 'dz': 0,
 'd ': 5375,
 'ea': 1165,
 'eb': 28,
 'ec': 576,
 'ed': 2124,
 'ee': 806,
 'ef': 184,
 'eg': 88,
 'eh': 27,
 'ei': 175,
 'ej': 2,
 'ek': 17,
 'el': 660,
 'em': 512,
 'en': 1363,
 'eo': 68,
 'ep': 245,
 'eq': 14,
 'er': 2876,
 'es': 1090,
 'et': 472,
 'eu': 5,
 'ev': 279,
 'ew': 147,
 'ex': 105,
 'ey': 568,
 'ez': 1,
 'e ': 7884,
 'fa': 192,
 'fb': 0,
 'fc': 0,
 'fd': 0,
 'fe': 246,
 'ff': 97,
 'fg': 0,
 'fh': 0,
 'fi': 234,
 'fj': 0,
 'fk': 0,
 'fl': 113,
 'fm': 0,
 'fn': 0,
 'fo': 749,
 'fp': 0,
 'fq': 0,
 'fr': 298,
 'fs': 1,
 'ft': 157,
 'fu': 214,
 'fv': 0,
 'fw': 1,
 'fx': 0,
 'fy': 2,
 'fz': 0,
 'f ': 1214,
 'ga': 222,
 'gb': 0,
 'gc': 0,
 'gd': 2,
 'ge': 410,
 'gf': 0,
 'gg': 35,
 'gh': 444,
 'gi': 198,
 'gj': 0,
 'gk': 0,
 'gl': 84,
 'gm': 0,
 'gn': 4,
 'go': 288,
 'gp': 0,
 'gq': 0,
 'gr': 485,
 'gs': 75,
 'gt': 3,
 'gu': 135,
 'gv': 0,
 'gw': 0,
 'gx': 0,
 'gy': 5,
 'gz': 0,
 'g ': 964,
 'ha': 1515,
 'hb': 0,
 'hc': 0,
 'hd': 0,
 'he': 6611,
 'hf': 2,
 'hg': 0,
 'hh': 0,
 'hi': 1157,
 'hj': 0,
 'hk': 34,
 'hl': 5,
 'hm': 0,
 'hn': 0,
 'ho': 688,
 'hp': 0,
 'hq': 0,
 'hr': 142,
 'hs': 21,
 'ht': 299,
 'hu': 69,
 'hv': 0,
 'hw': 2,
 'hx': 0,
 'hy': 416,
 'hz': 0,
 'h ': 846,
 'ia': 59,
 'ib': 89,
 'ic': 423,
 'id': 677,
 'ie': 424,
 'if': 266,
 'ig': 309,
 'ih': 0,
 'ii': 22,
 'ij': 0,
 'ik': 71,
 'il': 624,
 'im': 386,
 'in': 2595,
 'io': 393,
 'ip': 42,
 'iq': 0,
 'ir': 440,
 'is': 1048,
 'it': 1504,
 'iu': 5,
 'iv': 207,
 'iw': 0,
 'ix': 7,
 'iy': 0,
 'iz': 68,
 'i ': 611,
 'ja': 8,
 'jb': 0,
 'jc': 0,
 'jd': 0,
 'je': 97,
 'jf': 0,
 'jg': 0,
 'jh': 0,
 'ji': 0,
 'jj': 0,
 'jk': 0,
 'jl': 0,
 'jm': 0,
 'jn': 0,
 'jo': 75,
 'jp': 0,
 'jq': 0,
 'jr': 0,
 'js': 0,
 'jt': 0,
 'ju': 60,
 'jv': 0,
 'jw': 0,
 'jx': 0,
 'jy': 0,
 'jz': 0,
 'j ': 0,
 'ka': 62,
 'kb': 0,
 'kc': 0,
 'kd': 0,
 'ke': 741,
 'kf': 10,
 'kg': 0,
 'kh': 0,
 'ki': 269,
 'kj': 0,
 'kk': 0,
 'kl': 27,
 'km': 7,
 'kn': 110,
 'ko': 1,
 'kp': 0,
 'kq': 0,
 'kr': 0,
 'ks': 82,
 'kt': 0,
 'ku': 0,
 'kv': 0,
 'kw': 3,
 'kx': 0,
 'ky': 20,
 'kz': 0,
 'k ': 455,
 'la': 425,
 'lb': 2,
 'lc': 6,
 'ld': 637,
 'le': 1083,
 'lf': 90,
 'lg': 2,
 'lh': 0,
 'li': 828,
 'lj': 0,
 'lk': 117,
 'll': 1165,
 'lm': 12,
 'ln': 1,
 'lo': 637,
 'lp': 52,
 'lq': 0,
 'lr': 4,
 'ls': 77,
 'lt': 71,
 'lu': 72,
 'lv': 58,
 'lw': 21,
 'lx': 0,
 'ly': 628,
 'lz': 0,
 'l ': 1049,
 'ma': 770,
 'mb': 60,
 'mc': 0,
 'md': 0,
 'me': 948,
 'mf': 11,
 'mg': 0,
 'mh': 3,
 'mi': 241,
 'mj': 0,
 'mk': 0,
 'ml': 9,
 'mm': 34,
 'mn': 6,
 'mo': 279,
 'mp': 88,
 'mq': 0,
 'mr': 13,
 'ms': 119,
 'mt': 0,
 'mu': 174,
 'mv': 0,
 'mw': 0,
 'mx': 0,
 'my': 221,
 'mz': 0,
 'm ': 654,
 'na': 99,
 'nb': 100,
 'nc': 242,
 'nd': 2476,
 'ne': 816,
 'nf': 23,
 'ng': 1192,
 'nh': 13,
 'ni': 221,
 'nj': 5,
 'nk': 204,
 'nl': 102,
 'nm': 1,
 'nn': 85,
 'no': 752,
 'np': 2,
 'nq': 14,
 'nr': 14,
 'ns': 393,
 'nt': 691,
 'nu': 40,
 'nv': 7,
 'nw': 1,
 'nx': 15,
 'ny': 194,
 'nz': 0,
 'n ': 2485,
 'oa': 108,
 'ob': 35,
 'oc': 67,
 'od': 407,
 'oe': 62,
 'of': 1044,
 'og': 50,
 'oh': 40,
 'oi': 134,
 'oj': 88,
 'ok': 205,
 'ol': 320,
 'om': 580,
 'on': 1549,
 'oo': 808,
 'op': 245,
 'oq': 7,
 'or': 1641,
 'os': 200,
 'ot': 1062,
 'ou': 1952,
 'ov': 196,
 'ow': 979,
 'ox': 6,
 'oy': 43,
 'oz': 171,
 'o ': 2101,
 'pa': 220,
 'pb': 8,
 'pc': 0,
 'pd': 2,
 'pe': 375,
 'pf': 0,
 'pg': 2,
 'ph': 28,
 'pi': 115,
 'pj': 0,
 'pk': 0,
 'pl': 287,
 'pm': 3,
 'pn': 0,
 'po': 281,
 'pp': 166,
 'pq': 0,
 'pr': 337,
 'ps': 35,
 'pt': 117,
 'pu': 77,
 'pv': 0,
 'pw': 1,
 'px': 0,
 'py': 67,
 'pz': 0,
 'p ': 265,
 'qa': 0,
 'qb': 0,
 'qc': 0,
 'qd': 0,
 'qe': 0,
 'qf': 0,
 'qg': 0,
 'qh': 0,
 'qi': 0,
 'qj': 0,
 'qk': 0,
 'ql': 0,
 'qm': 0,
 'qn': 0,
 'qo': 0,
 'qp': 0,
 'qq': 0,
 'qr': 0,
 'qs': 0,
 'qt': 0,
 'qu': 139,
 'qv': 0,
 'qw': 0,
 'qx': 0,
 'qy': 0,
 'qz': 0,
 'q ': 3,
 'ra': 654,
 'rb': 9,
 'rc': 71,
 'rd': 272,
 're': 2572,
 'rf': 57,
 'rg': 137,
 'rh': 10,
 'ri': 608,
 'rj': 0,
 'rk': 199,
 'rl': 165,
 'rm': 181,
 'rn': 200,
 'ro': 1537,
 'rp': 63,
 'rq': 0,
 'rr': 182,
 'rs': 333,
 'rt': 334,
 'ru': 153,
 'rv': 16,
 'rw': 16,
 'rx': 0,
 'ry': 387,
 'rz': 0,
 'r ': 2002,
 'sa': 599,
 'sb': 1,
 'sc': 302,
 'sd': 0,
 'se': 943,
 'sf': 4,
 'sg': 1,
 'sh': 870,
 'si': 281,
 'sj': 0,
 'sk': 197,
 'sl': 96,
 'sm': 58,
 'sn': 25,
 'so': 607,
 'sp': 128,
 'sq': 6,
 'sr': 0,
 'ss': 284,
 'st': 1202,
 'su': 190,
 'sv': 0,
 'sw': 122,
 'sx': 0,
 'sy': 16,
 'sz': 0,
 's ': 3081,
 'ta': 308,
 'tb': 0,
 'tc': 182,
 'td': 1,
 'te': 1125,
 'tf': 13,
 'tg': 0,
 'th': 6453,
 'ti': 721,
 'tj': 0,
 'tk': 0,
 'tl': 272,
 'tm': 3,
 'tn': 4,
 'to': 1762,
 'tp': 0,
 'tq': 0,
 'tr': 497,
 'ts': 178,
 'tt': 304,
 'tu': 159,
 'tv': 0,
 'tw': 44,
 'tx': 0,
 'ty': 187,
 'tz': 0,
 't ': 3602,
 'ua': 50,
 'ub': 26,
 'uc': 128,
 'ud': 62,
 'ue': 117,
 'uf': 34,
 'ug': 250,
 'uh': 0,
 'ui': 108,
 'uj': 0,
 'uk': 0,
 'ul': 547,
 'um': 62,
 'un': 615,
 'uo': 0,
 'up': 248,
 'uq': 0,
 'ur': 503,
 'us': 462,
 'ut': 802,
 'uu': 0,
 'uv': 0,
 'uw': 0,
 'ux': 0,
 'uy': 0,
 'uz': 7,
 'u ': 490,
 'va': 13,
 'vb': 0,
 'vc': 0,
 'vd': 0,
 've': 1105,
 'vf': 0,
 'vg': 0,
 'vh': 0,
 'vi': 108,
 'vj': 0,
 'vk': 0,
 'vl': 0,
 'vm': 0,
 'vn': 0,
 'vo': 45,
 'vp': 0,
 'vq': 0,
 'vr': 0,
 'vs': 0,
 'vt': 0,
 'vu': 0,
 'vv': 0,
 'vw': 0,
 'vx': 0,
 'vy': 9,
 'vz': 0,
 'v ': 5,
 'wa': 980,
 'wb': 0,
 'wc': 0,
 'wd': 5,
 'we': 794,
 'wf': 9,
 'wg': 0,
 'wh': 711,
 'wi': 1031,
 'wj': 0,
 'wk': 2,
 'wl': 24,
 'wm': 0,
 'wn': 163,
 'wo': 616,
 'wp': 0,
 'wq': 0,
 'wr': 19,
 'ws': 50,
 'wt': 0,
 'wu': 2,
 'wv': 0,
 'ww': 9,
 'wx': 0,
 'wy': 0,
 'wz': 0,
 'w ': 588,
 'xa': 8,
 'xb': 0,
 'xc': 29,
 'xd': 0,
 'xe': 36,
 'xf': 0,
 'xg': 0,
 'xh': 1,
 'xi': 36,
 'xj': 0,
 'xk': 0,
 'xl': 0,
 'xm': 0,
 'xn': 0,
 'xo': 0,
 'xp': 24,
 'xq': 0,
 'xr': 0,
 'xs': 0,
 'xt': 37,
 'xu': 0,
 'xv': 8,
 'xw': 0,
 'xx': 10,
 'xy': 0,
 'xz': 0,
 'x ': 12,
 'ya': 16,
 'yb': 3,
 'yc': 16,
 'yd': 0,
 'ye': 182,
 'yf': 4,
 'yg': 0,
 'yh': 0,
 'yi': 61,
 'yj': 0,
 'yk': 0,
 'yl': 3,
 'ym': 4,
 'yn': 1,
 'yo': 703,
 'yp': 3,
 'yq': 0,
 'yr': 20,
 'ys': 101,
 'yt': 41,
 'yu': 0,
 'yv': 0,
 'yw': 12,
 'yx': 0,
 'yy': 0,
 'yz': 0,
 'y ': 2351,
 'za': 47,
 'zb': 0,
 'zc': 0,
 'zd': 0,
 'ze': 29,
 'zf': 0,
 'zg': 0,
 'zh': 0,
 'zi': 6,
 'zj': 0,
 'zk': 0,
 'zl': 6,
 'zm': 0,
 'zn': 0,
 'zo': 0,
 'zp': 0,
 'zq': 0,
 'zr': 0,
 'zs': 0,
 'zt': 0,
 'zu': 2,
 'zv': 0,
 'zw': 0,
 'zx': 0,
 'zy': 7,
 'zz': 8,
 'z ': 93,
 ' a': 5006,
 ' b': 1847,
 ' c': 1691,
 ' d': 1331,
 ' e': 667,
 ' f': 1531,
 ' g': 1208,
 ' h': 2599,
 ' i': 2128,
 ' j': 139,
 ' k': 335,
 ' l': 1203,
 ' m': 1441,
 ' n': 808,
 ' o': 2198,
 ' p': 918,
 ' q': 100,
 ' r': 765,
 ' s': 3324,
 ' t': 7379,
 ' u': 514,
 ' v': 154,
 ' w': 3466,
 ' x': 30,
 ' y': 705,
 ' z': 4,
 '  ': 1401}

Exercise#

Calculate the probabilities:

\[p("b") = ?\]
\[p("an") = ?\]
# Marginal probability of the letter "b", read off the letter frequencies
p_b = probs_letter["b"]
p_b
0.0111954632763194
# Probability of the bigram "an" among all counted bigrams
p_an = count_bigram["an"] / total_sum_bigram
p_an
0.015006982744336875

Exercise#

Calculate the probabilities:

\[p(x="a" | y ="n") =?\]
\[p(y="n"|x="a") =?\]

\(p(x="a"|y="n")\)#

To calculate the first probability, we need to define a subset that contains all the bigrams whose second letter is equal to “n”:

# Restrict attention to the bigrams whose second letter is "n"
subset_x_yn = {first + "n": count_bigram[first + "n"] for first in alphabet}
subset_x_yn
{'an': 3170,
 'bn': 0,
 'cn': 0,
 'dn': 16,
 'en': 1363,
 'fn': 0,
 'gn': 4,
 'hn': 0,
 'in': 2595,
 'jn': 0,
 'kn': 110,
 'ln': 1,
 'mn': 6,
 'nn': 85,
 'on': 1549,
 'pn': 0,
 'qn': 0,
 'rn': 200,
 'sn': 25,
 'tn': 4,
 'un': 615,
 'vn': 0,
 'wn': 163,
 'xn': 0,
 'yn': 1,
 'zn': 0,
 ' n': 808}

The total of the subset, total_subset_x_yn, is:

subset_x_yn.values()
dict_values([3170, 0, 0, 16, 1363, 0, 4, 0, 2595, 0, 110, 1, 6, 85, 1549, 0, 0, 200, 25, 4, 615, 0, 163, 0, 1, 0, 808])
# Convert the dict values to a numpy array so we can
# evaluate the total directly via the array's 'sum()' method:
array_1 = np.array(list(subset_x_yn.values()))
array_1
array([3170,    0,    0,   16, 1363,    0,    4,    0, 2595,    0,  110,
          1,    6,   85, 1549,    0,    0,  200,   25,    4,  615,    0,
        163,    0,    1,    0,  808])
# Normalization constant: total number of bigrams ending in "n"
total_subset_x_yn = array_1.sum()
total_subset_x_yn
10715

Thus, \(p(x="a"|y="n")\) probability is:

\[p(x="a"|y="n") = \frac{3170}{10715}\approx0.296\]
# p(x="a" | y="n"): the fraction of the "?n" bigrams that are "an"
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn
prob_xa_yn
0.2958469435370975

\(p(y="n"|x="a")\)#

For the second probability, we construct a new subset that contains the bigrams whose first letter is “a”:

# Restrict attention to the bigrams whose first letter is "a"
subset_y_xa = {"a" + second: count_bigram["a" + second] for second in alphabet}
subset_y_xa
{'aa': 0,
 'ab': 124,
 'ac': 350,
 'ad': 789,
 'ae': 1,
 'af': 143,
 'ag': 264,
 'ah': 12,
 'ai': 790,
 'aj': 2,
 'ak': 156,
 'al': 999,
 'am': 310,
 'an': 3170,
 'ao': 0,
 'ap': 218,
 'aq': 0,
 'ar': 1552,
 'as': 1506,
 'at': 1442,
 'au': 147,
 'av': 343,
 'aw': 223,
 'ax': 36,
 'ay': 377,
 'az': 12,
 'a ': 897}

The rest is similar to the first one:

# Collect this subset's counts into a numpy array as well
array_2 = np.array(list(subset_y_xa.values()))
array_2
array([   0,  124,  350,  789,    1,  143,  264,   12,  790,    2,  156,
        999,  310, 3170,    0,  218,    0, 1552, 1506, 1442,  147,  343,
        223,   36,  377,   12,  897])
# Normalization constant: total number of bigrams starting with "a"
total_subset_y_xa = array_2.sum()
total_subset_y_xa
13863
\[p(y="n"|x="a") = \frac{3170}{13863}\approx0.229\]
# p(y="n" | x="a"): the fraction of the "a?" bigrams that are "an"
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa
prob_yn_xa
0.22866623385991489

Exercise#

Calculate the probability:

\[\frac{p(y="n"|x="a")p("a")}{p("n")}\]

For this one, we have all the factors:

prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29533928474819926

Let’s check if this is equal to \(p(x="a"|y="n")\) as Bayes Theorem dictates:

prob_xa_yn
0.2958469435370975

The two values are close but not equal. I hope that you are able to figure out the reason for this difference. If not, please ponder on it a while before proceeding! 8)


















Wait while the reader ponders!
(Spoilers ahead!)















The reason is due to the characters unaccounted for (e.g., “d.”). Consider the following paragraph:

“My darling child!” she cried, folding the little girl in her arms and covering her face with kisses. “Where in the world did you come from?”

If we were to calculate the marginal probability of \(p("n")\), for example, we would first count all the occurrences of the letter “n” and then divide by all the characters included in the text:

# A sample paragraph that contains punctuation characters
# not present in our 27-symbol alphabet
text = '''"My darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "Where in the world did you come from?"'''

text = text.lower()
print(text)
"my darling child!" she cried, folding the little girl in her arms and
covering her face with kisses. "where in the world did you come from?"
# Occurrences of "n" in the sample paragraph
count_n = text.count("n")
count_n
6
# Total number of characters, punctuation included
count_all = len(text)
count_all
141
# Marginal probability of "n" over ALL characters (including the
# punctuation that the bigram counting never sees)
p_n = count_n / count_all
p_n
0.0425531914893617

However, this marginal probability includes the uncounted characters such as {"'", '"', ",", ".", "!"} which aren’t accounted for when we were calculating the bigram probabilities, thus messing with our calculations. The same issue affects the bigram probabilities as well, causing an inconsistency.

To remedy this issue, we should have excluded all the characters except the ones we had in our alphabet before we had started. This filtering can be done via the regular expression module re’s sub() method:

import re
# Replace every character outside our alphabet ('[^a-z ]' matches
# anything that is not a lowercase letter or a space) with a space
text_filtered = re.sub('[^a-z ]',' ',text)
print(text_filtered)
 my darling child   she cried  folding the little girl in her arms and covering her face with kisses   where in the world did you come from  

Although regular expressions are a whole topic by themselves, to briefly explain: we first define a range inside the square brackets to include any letter from a to z ([a-z]), and then also add the space character to this range ([a-z ]). But since these are the characters we want to keep, we negate our statement by putting the negation sign ("^") at the start, meaning “every character that is not in this range” ([^a-z ]). The second parameter is the replacement: by passing " " (a space), we are saying that every matching character should be replaced with a space. The third parameter is the text we want to operate on.

Thus, we end up with a text that only contains the characters we are taking into account.

Below, we’ll repeat the same procedures we did above, only this time we’ll filter our text to only include the letters from a to z and the space character:

# Re-read the book and redo the whole analysis, this time filtering the
# text so that it contains only the characters of our alphabet.
with open(fname, 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

# Convert it to all lowercase
data = data.lower()

# Replace every out-of-alphabet character with a space
data = re.sub('[^a-z ]', ' ', data)

# Letter counts and probabilities
count_letter = {}
for letter in alphabet:
    count_letter[letter] = data.count(letter)

tot_count = np.sum(list(count_letter.values()))

probs_letter = {}
for letter in count_letter.keys():
    probs_letter[letter] = count_letter[letter] / tot_count

# Bigram counts via a single sliding-window pass: str.count() would
# miss overlapping occurrences (e.g. the two "ee" pairs inside "eee"),
# so we examine every consecutive character pair instead.
count_bigram = {x + y: 0 for x in alphabet for y in alphabet}
total_sum_bigram = 0
for first, second in zip(data, data[1:]):
    bigram = first + second
    if bigram in count_bigram:
        count_bigram[bigram] += 1
        total_sum_bigram += 1

# Marginal probabilities of "b" and of the bigram "an"
p_b = probs_letter["b"]

p_an = count_bigram["an"] / total_sum_bigram

# Subset of bigrams whose second letter is "n"
subset_x_yn = {}
for x in alphabet:
    bigram = x + "n"
    subset_x_yn[bigram] = count_bigram[bigram]

array_1 = np.array(list(subset_x_yn.values()))

total_subset_x_yn = array_1.sum()

# p(x="a" | y="n")
prob_xa_yn = subset_x_yn["an"] / total_subset_x_yn

# Subset of bigrams whose first letter is "a"
subset_y_xa = {}
for y in alphabet:
    bigram = "a" + y
    subset_y_xa[bigram] = count_bigram[bigram]

array_2 = np.array(list(subset_y_xa.values()))

total_subset_y_xa = array_2.sum()

# p(y="n" | x="a")
prob_yn_xa = subset_y_xa["an"] / total_subset_y_xa

# Bayes' theorem check: this should equal prob_xa_yn
prob_yn_xa * probs_letter["a"] / probs_letter["n"]
0.29482886904761907
prob_xa_yn
0.29482886904761907

Thus, we have saved the Bayes Theorem! 8)

\[p(a|b) = \frac{p(b|a)p(a)}{p(b)}\]

Visualizing the frequencies#

Using the pandas module, we can also “visualize” the frequencies:

import pandas as pd

# Tabulate the letter counts; the background gradient shades the
# cells so that darker means more frequent.
letter_counts = list(count_letter.values())
df_letter = pd.DataFrame(letter_counts,
                         index=alphabet,
                         columns=["count"])
df_letter.style.background_gradient(cmap="binary")

# https://matplotlib.org/stable/tutorials/colors/colormaps.html
# https://stackoverflow.com/a/50605020
  count
a 13887
b 2448
c 4161
d 8888
e 22261
f 3588
g 3587
h 11977
i 10350
j 240
k 1920
l 7256
m 3830
n 10752
o 14240
p 2466
q 144
r 10534
s 9689
t 16378
u 4600
v 1290
w 5229
x 209
y 4128
z 277
52771
# Build one row per (letter1, letter2) pair.  Collecting all the rows
# first and constructing the DataFrame once avoids the quadratic cost
# of the df.loc[-1] append + full reindex on every single insertion.
rows = [[l1, l2, data.count(l1 + l2)]
        for l1 in alphabet for l2 in alphabet]
# The original insert-at-top loop left the first pair ("aa") carrying
# the highest index label and the last pair label 0; reproduce that.
df = pd.DataFrame(rows, columns=["l1", "l2", "count"],
                  index=range(len(rows) - 1, -1, -1))
df.insert(loc=2, column="word", value=df["l1"] + df["l2"])
df
l1 l2 word count
728 a a aa 0
727 a b ab 124
726 a c ac 350
725 a d ad 789
724 a e ae 1
... ... ... ... ...
4 w w 3634
3 x x 30
2 y y 756
1 z z 10
0 6884

729 rows × 4 columns

len_alpha = len(alphabet)

# Pack the bigram counts into a (27 x 27) matrix: rows index the first
# letter of the bigram, columns the second.
mat = np.zeros((len_alpha, len_alpha), int)
for i in range(len_alpha):
    for j in range(len_alpha):
        # Select the single matching row and pull the scalar out
        # explicitly: assigning the one-element Series itself relies on
        # implicit array-to-scalar conversion, which NumPy deprecated.
        sel = df.loc[df.word == (alphabet[i] + alphabet[j]), "count"]
        mat[i, j] = sel.values[0]

df_bi = pd.DataFrame(mat, index=alphabet, columns=alphabet)

import matplotlib.pyplot as plt
from matplotlib import colors

def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return per-cell CSS background colors for one styled column.

    s    -- the Series (one DataFrame column) being styled
    m, M -- global minimum and maximum, so every column is shaded on
            the same scale instead of per-column
    cmap -- matplotlib colormap name
    low, high -- fractions of the range by which to extend the
            normalization below m / above M
    """
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # matplotlib 3.7 and removed in 3.9.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]


# Shade the whole bigram table on one global scale by passing the
# global min/max; high=0.85 stretches the normalization range above the
# maximum, presumably so the largest counts do not saturate to black
df_bi.style.apply(background_gradient,cmap="binary",
               m=df_bi.min().min(),
               M=df_bi.max().max(),
               low=0,
               high=0.85)

# https://stackoverflow.com/a/42563850
  a b c d e f g h i j k l m n o p q r s t u v w x y z
a 0 124 350 789 1 143 264 12 790 2 156 999 310 3170 0 218 0 1552 1506 1442 147 343 223 36 377 12 921
b 254 9 0 0 857 0 0 0 120 2 0 191 0 0 207 0 0 243 7 6 393 0 0 0 147 0 12
c 798 0 26 0 470 0 0 699 150 0 377 189 0 0 692 0 0 394 0 232 67 0 0 0 21 0 46
d 199 0 6 73 715 12 19 1 358 0 1 130 189 16 781 0 0 94 170 0 29 7 5 0 57 0 6026
e 1165 28 576 2124 806 184 88 27 175 2 17 660 512 1363 68 245 14 2876 1090 472 5 279 147 105 568 1 8664
f 192 0 0 0 246 97 0 0 234 0 0 113 0 0 749 0 0 298 1 157 214 0 1 0 2 0 1284
g 222 0 0 2 410 0 35 444 198 0 0 84 0 4 288 0 0 485 75 3 135 0 0 0 5 0 1197
h 1515 0 0 0 6611 2 0 0 1157 0 34 5 0 0 688 0 0 142 21 299 69 0 2 0 416 0 1016
i 59 89 423 677 424 266 309 0 22 0 71 624 386 2595 393 42 0 440 1048 1504 5 207 0 7 0 68 681
j 8 0 0 0 97 0 0 0 0 0 0 0 0 0 75 0 0 0 0 0 60 0 0 0 0 0 0
k 62 0 0 0 741 10 0 0 269 0 0 27 7 110 1 0 0 0 82 0 0 0 3 0 20 0 588
l 425 2 6 637 1083 90 2 0 828 0 117 1165 12 1 637 52 0 4 77 71 72 58 21 0 628 0 1268
m 770 60 0 0 948 11 0 3 241 0 0 9 34 6 279 88 0 13 119 0 174 0 0 0 221 0 854
n 99 100 242 2476 816 23 1192 13 221 5 204 102 1 85 752 2 14 14 393 691 40 7 1 15 194 0 3050
o 108 35 67 407 62 1044 50 40 134 88 205 320 580 1549 808 245 7 1641 200 1062 1952 196 979 6 43 171 2241
p 220 8 0 2 375 0 2 28 115 0 0 287 3 0 281 166 0 337 35 117 77 0 1 0 67 0 345
q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 139 0 0 0 0 0 5
r 654 9 71 272 2572 57 137 10 608 0 199 165 181 200 1537 63 0 182 333 334 153 16 16 0 387 0 2378
s 599 1 302 0 943 4 1 870 281 0 197 96 58 25 607 128 6 0 284 1202 190 0 122 0 16 0 3757
t 308 0 182 1 1125 13 0 6453 721 0 0 272 3 4 1762 0 0 497 178 304 159 0 44 0 187 0 4165
u 50 26 128 62 117 34 250 0 108 0 0 547 62 615 0 248 0 503 462 802 0 0 0 0 0 7 579
v 13 0 0 0 1105 0 0 0 108 0 0 0 0 0 45 0 0 0 0 0 0 0 0 0 9 0 10
w 980 0 0 5 794 9 0 711 1031 0 2 24 0 163 616 0 0 19 50 0 2 0 9 0 0 0 805
x 8 0 29 0 36 0 0 1 36 0 0 0 0 0 0 24 0 0 0 37 0 8 0 10 0 0 20
y 16 3 16 0 182 4 0 0 61 0 0 3 4 1 703 3 0 20 101 41 0 0 12 0 0 0 2958
z 47 0 0 0 29 0 0 0 6 0 0 6 0 0 0 0 0 0 0 0 2 0 0 0 7 8 172
5116 1954 1737 1361 696 1585 1238 2665 2368 141 340 1238 1488 845 2271 942 103 780 3457 7602 516 169 3634 30 756 10 6884