Word embedding demo
UMass CS 490A, 2021-10-28
import numpy as np
Load the data. Downloaded a year or two ago from https://nlp.stanford.edu/projects/glove/
2021-11-02: it appears the links on that webpage don't work, which was also an issue during the 10/28 lecture. In OH today we looked at http://web.archive.org (highly recommended!) and ended up finding this link, which currently appears to work: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
There are some other copies on the web too.
For actual use, not just pedagogy, I'd recommend one of the larger embedding versions listed on that webpage.
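If you don't have a local copy, a minimal download-and-extract sketch (assuming the downloads.cs.stanford.edu link above still works, and that the zip contains a member named glove.6B.50d.txt) might look like:
import urllib.request, zipfile
urllib.request.urlretrieve("http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip", "glove.6B.zip")
with zipfile.ZipFile("glove.6B.zip") as z:
    z.extract("glove.6B.50d.txt")  # assumes this member name; the zip should also have 100d/200d/300d versions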
lines = open("/users/brenocon/data/lexical/glove/glove.6B.50d.txt").readlines()
lines[1000]
'themselves 0.64757 -0.88334 0.34135 -1.1426 0.61266 0.14531 -0.39535 0.40737 -0.86006 -0.027515 0.055264 0.34975 -0.15737 0.19496 -0.050624 0.39901 0.69418 -0.29575 0.24179 -0.94232 0.080787 0.48024 0.83972 0.50908 0.067574 -1.2896 -0.67743 -0.45833 0.56853 -1.0018 3.0505 1.0944 -0.35772 -1.1315 -0.26353 0.4984 -0.22614 0.022794 -0.62991 -0.38057 -0.2151 -0.77651 0.4204 0.84677 0.34003 0.049138 -0.90712 -0.56078 -0.14028 -0.69459\n'
vocab = np.array( [L.split()[0] for L in lines] )
vocab
array(['the', ',', '.', ..., 'rolonda', 'zsombor', 'sandberger'], dtype='<U68')
len(vocab)
400000
wordvecs = np.array([ np.array( [float(x) for x in L.split()[1:]] ) for L in lines])
wordvecs.shape
(400000, 50)
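(Aside: the two list comprehensions above split every line twice. A single-pass version, just as a sketch, builds both arrays at once:)
vocab_list, vec_list = [], []
for line in lines:
    word, *vals = line.split()          # first token is the word, the rest are the 50 values
    vocab_list.append(word)
    vec_list.append([float(x) for x in vals])
vocab = np.array(vocab_list)
wordvecs = np.array(vec_list)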
Look up a word in the matrix
i=np.where(vocab=="ketchup")[0][0]
wordvecs[i,:]
array([ 0.19236 , -0.025945, -0.36305 , 0.26261 , 0.15002 , 0.050377, 0.044166, 0.17523 , -0.20677 , 1.1079 , -0.15711 , 0.76912 , 0.41692 , -0.57206 , 0.18748 , -0.94723 , -0.75695 , 0.66576 , -0.10706 , -0.73758 , -0.032387, -1.2124 , 1.4949 , 0.17428 , -1.0279 , -0.63227 , -1.1102 , 0.88395 , 1.4222 , -0.42722 , -0.22668 , 0.90733 , -0.42899 , 1.214 , 0.23611 , 0.025487, -1.105 , 1.2122 , 0.85841 , -0.013477, 1.2237 , 0.18839 , -0.32792 , -0.51933 , 0.62634 , 1.0915 , -0.70178 , -0.17399 , 0.61597 , 0.1225 ])
ketchup = wordvecs[np.where(vocab=="ketchup")[0][0], :]
mustard = wordvecs[np.where(vocab=="mustard")[0][0], :]
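np.where scans the whole 400k-word vocab on every lookup. A small helper I'd add here (my addition, not part of the lecture code) precomputes a word-to-row dict so lookups are constant time:
word2idx = {w: i for i, w in enumerate(vocab)}   # word -> row index in wordvecs
def getvec(word):
    return wordvecs[word2idx[word], :]
ketchup = getvec("ketchup")   # same vectors as above
mustard = getvec("mustard")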
If you look carefully, the values in the two vectors are somewhat correlated: the first value is moderately positive in both, the second is slightly negative, the third is more negative, etc.
mustard,ketchup
(array([ 0.13869 , -0.055471 , -0.4075 , 0.14637 , 0.51612 , 0.27859 , 0.35174 , 0.34871 , -0.48427 , 0.17175 , 0.25688 , 0.27237 , 0.80843 , -0.3955 , -0.36863 , -0.119 , 0.18 , 0.7981 , -0.69538 , -0.71586 , -0.32043 , -0.89718 , 1.9836 , -0.32445 , -1.3148 , -0.30567 , -0.31968 , 0.97388 , 1.6489 , 0.085363 , 0.73834 , -0.45053 , -0.44489 , 0.45519 , 0.95134 , 1.1111 , -0.75693 , 0.62207 , 1.0404 , -0.0040949, 1.2347 , 0.063536 , 0.075837 , -1.0005 , 2.0768 , 1.193 , 0.11994 , 0.37672 , -0.32032 , -0.037603 ]), array([ 0.19236 , -0.025945, -0.36305 , 0.26261 , 0.15002 , 0.050377, 0.044166, 0.17523 , -0.20677 , 1.1079 , -0.15711 , 0.76912 , 0.41692 , -0.57206 , 0.18748 , -0.94723 , -0.75695 , 0.66576 , -0.10706 , -0.73758 , -0.032387, -1.2124 , 1.4949 , 0.17428 , -1.0279 , -0.63227 , -1.1102 , 0.88395 , 1.4222 , -0.42722 , -0.22668 , 0.90733 , -0.42899 , 1.214 , 0.23611 , 0.025487, -1.105 , 1.2122 , 0.85841 , -0.013477, 1.2237 , 0.18839 , -0.32792 , -0.51933 , 0.62634 , 1.0915 , -0.70178 , -0.17399 , 0.61597 , 0.1225 ]))
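As a quick check on that claim (my addition, not from the lecture), the Pearson correlation between corresponding values of the two 50-dimensional vectors can be computed directly:
np.corrcoef(mustard, ketchup)[0, 1]   # correlation between the mustard and ketchup values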
np.linalg.norm(mustard)
5.314349515320761
So let's use a dot product (a.k.a. inner product) to make this comparison quantitative. Walking down the two vectors in parallel: when corresponding values are both highly positive or both highly negative, their product is large and positive, so the inner product tends to be high.
See paper notes: cosine similarity as a normalized dot product
def cossim(x,y):
    return x.dot(y) / np.linalg.norm(x) / np.linalg.norm(y)
Sanity check: a vector's cosine similarity with itself is 1, the maximum
cossim( np.array([ 1,2,3]), np.array([1,2,3] ))
1.0
Sanity check: when some signs are opposite, the vectors come out very dissimilar.
cossim( np.array([ 1, -2,-3]), np.array([1, 2, 3] ))
-0.8571428571428572
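Two more properties worth noting (quick checks I'm adding here): cosine ignores vector length, and an exact negation hits the minimum of -1.
cossim( np.array([1,2,3]), np.array([2,4,6]) )     # ≈ 1.0: scaling a vector doesn't change the angle
cossim( np.array([1,2,3]), np.array([-1,-2,-3]) )  # ≈ -1.0: exact opposite direction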
cossim(mustard, ketchup)
0.7032056047828732
cossim(ketchup, wordvecs[np.where(vocab=="tomato")[0][0], :])
0.721217394128946
cossim(ketchup, wordvecs[np.where(vocab=="blood")[0][0], :])
0.24730949990228326
cossim(ketchup, wordvecs[np.where(vocab=="sriracha")[0][0], :])
0.3469118335905573
cossim(ketchup, wordvecs[np.where(vocab=="catsup")[0][0], :])
0.6252614857918791
cossim(ketchup, wordvecs[np.where(vocab=="hotdog")[0][0], :])
0.2898977721984258
cossim(ketchup, wordvecs[np.where(vocab=="wasabi")[0][0], :])
0.5563536184261586
Let's find the top-10 most similar words to "ketchup": (1) calculate the similarity of every word in the vocab to "ketchup"; (2) sort in descending order, then take the first 10.
scores = np.array([ cossim(ketchup, wordvecs[w, :]) for w in range(len(vocab)) ])
np.argsort(-scores)
array([ 31156, 27384, 16883, ..., 206876, 219128, 339363])
for w in np.argsort(-scores)[:10]:
    print(w, vocab[w], scores[w] )
31156 ketchup 1.0
27384 mayonnaise 0.8362772686710104
16883 vodka 0.8055906780062482
28043 chocolates 0.7622651419344872
6892 sauce 0.7355295730490348
14639 vinegar 0.7348323781604711
47002 pesto 0.7313647399802565
62307 guacamole 0.7239271487594593
12333 tomato 0.721217394128946
57748 condiments 0.7062116164501253
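The Python loop over all 400,000 words is slow. Since cosine is a normalized dot product, the whole score vector can be computed with one matrix-vector product; a sketch of the vectorized equivalent (not the version used in lecture):
row_norms = np.linalg.norm(wordvecs, axis=1)       # normalize every row once
unit_vecs = wordvecs / row_norms[:, None]
scores_fast = unit_vecs @ (ketchup / np.linalg.norm(ketchup))   # shape (400000,), matches `scores` up to rounding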
target = wordvecs[np.where(vocab=="computer")[0][0], :]
scores = np.array([ cossim(target, wordvecs[w, :]) for w in range(len(vocab)) ])
for w in np.argsort(-scores)[:10]:
    print(w, vocab[w], scores[w] )
951 computer 1.0000000000000002
2802 computers 0.9165044765653498
1516 software 0.8814993634710457
732 technology 0.8525559133429749
2505 electronic 0.8125868044629013
925 internet 0.8060455558122077
8443 computing 0.8026036233505297
3461 devices 0.8016185204075192
2150 digital 0.7991792346702773
3310 applications 0.7912740180594209
target = wordvecs[np.where(vocab=="anime")[0][0], :]
scores = np.array([ cossim(target, wordvecs[w, :]) for w in range(len(vocab)) ])
for w in np.argsort(-scores)[:10]:
    print(w, vocab[w], scores[w] )
11634 anime 0.9999999999999999
12022 manga 0.8740692471585852
33056 live-action 0.8383249844211734
6092 animated 0.7918400550833321
21996 spin-off 0.7838730656022157
48754 adaption 0.7736955789707384
40448 ova 0.740956297276696
37369 sci-fi 0.7396066293878305
47480 comedy-drama 0.7334042803097398
7362 cartoon 0.7268918830370767
Question: what about the most DISSIMILAR words? The results don't make much sense. This is typical in these spaces: high similarities can say something interesting, but dissimilarities don't get you much.
target = wordvecs[np.where(vocab=="anime")[0][0], :]
scores = np.array([ cossim(target, wordvecs[w, :]) for w in range(len(vocab)) ])
for w in np.argsort(scores)[:10]:
    print(w, vocab[w], scores[w] )
240280 labarsouque -0.6038089140998685
28021 tih -0.5951866969062908
229437 fergushill -0.5598614123482951
350747 mussayab -0.5540935049783946
248129 górowo -0.5408055866219993
347873 komaroff -0.540001935345538
302741 diliani -0.5363593611938349
313266 centegra -0.5351073688171836
253713 iławeckie -0.5341987495738652
183564 josephat -0.532576622579604