Word embedding demo
UMass CS 485, 2023-11-02
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [5.2,5] ## fairly square plots
#plt.rcParams['figure.figsize'] = [14,6] ## fills notebook width
%matplotlib inline
Load the data. Downloaded a while ago from https://nlp.stanford.edu/projects/glove/
Sometimes the links on that page don't work. Via http://web.archive.org (highly recommended!) we found this, which appears to currently work: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
There are some other copies on the web too.
For actual use (not just pedagogy), I'd recommend one of the larger embedding versions listed on that webpage.
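If you don't already have the file locally, a minimal download-and-extract sketch might look like the following (the URL is the mirror above; the local filenames are just illustrative, and the zip is several hundred MB):
import urllib.request, zipfile
url = "http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip"
urllib.request.urlretrieve(url, "glove.6B.zip")  # large download
with zipfile.ZipFile("glove.6B.zip") as zf:
    zf.extract("glove.6B.50d.txt")  # the 50-dimensional version used below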
lines = open("/users/brenocon/data/lexical/glove/glove.6B.50d.txt").readlines()
lines[1000]
'themselves 0.64757 -0.88334 0.34135 -1.1426 0.61266 0.14531 -0.39535 0.40737 -0.86006 -0.027515 0.055264 0.34975 -0.15737 0.19496 -0.050624 0.39901 0.69418 -0.29575 0.24179 -0.94232 0.080787 0.48024 0.83972 0.50908 0.067574 -1.2896 -0.67743 -0.45833 0.56853 -1.0018 3.0505 1.0944 -0.35772 -1.1315 -0.26353 0.4984 -0.22614 0.022794 -0.62991 -0.38057 -0.2151 -0.77651 0.4204 0.84677 0.34003 0.049138 -0.90712 -0.56078 -0.14028 -0.69459\n'
vocab = np.array( [L.split()[0] for L in lines] )
vocab
array(['the', ',', '.', ..., 'rolonda', 'zsombor', 'sandberger'], dtype='<U68')
len(vocab)
400000
np.random.choice(vocab)
'žrk'
# Parse the plain text numbers into a matrix (a numpy array, not a numpy "matrix"; those are silly)
wordvecs = np.array([ np.array( [float(x) for x in L.split()[1:]] ) for L in lines])
wordvecs.shape
(400000, 50)
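A one-pass variant of that parse (just a sketch; it gives the same result but splits each line only once) keeps each word and its 50 floats together:
# One-pass parse: split each line once, collect words and vectors side by side.
vocab_list, vec_list = [], []
for L in lines:
    parts = L.split()
    vocab_list.append(parts[0])
    vec_list.append(np.asarray(parts[1:], dtype=float))
assert np.array_equal(np.array(vocab_list), vocab)
assert np.allclose(np.array(vec_list), wordvecs)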
i=100000; vocab[i], wordvecs[i,:]
('capron', array([-0.75282 , -0.13948 , -0.11653 , 0.014349 , -0.13615 , 0.53752 , 0.017668 , 0.44476 , -0.66653 , -0.33795 , -0.42435 , 0.6666 , 0.15256 , -0.99617 , -0.5356 , 0.018101 , 0.86437 , 0.38901 , -0.44585 , 0.048923 , 0.48488 , -0.21526 , -0.36698 , -0.29817 , 0.8916 , 0.15681 , 0.4978 , -0.20641 , -0.34411 , 0.15804 , -1.5266 , -0.99805 , 0.94599 , 0.32249 , 0.19855 , 0.0061474, 0.19099 , 0.025905 , 0.67375 , 0.79916 , 0.52587 , 0.28146 , -0.6072 , 0.36219 , 0.45485 , -0.032826 , -0.075069 , 0.035115 , 0.11544 , -0.14412 ]))
Look up a word in the matrix
np.where(vocab=="life")
(array([214]),)
i=np.where(vocab=="life")[0][0]
wordvecs[i,:]
array([ 0.51491 , 0.88806 , -0.71906 , -0.5748 , 0.85655 , 0.52474 , -0.31788 , -0.20168 , 0.17936 , 0.51999 , -0.11527 , 0.59296 , -0.3468 , 0.052568, 0.87153 , -0.036582, -0.056057, 0.08516 , 0.036249, 0.23403 , 0.073175, 1.1394 , -0.17921 , -0.034245, 0.69977 , -1.6516 , -1.106 , -0.44145 , 0.77042 , 0.23963 , 3.1823 , -0.020451, -0.056117, -0.69918 , -0.19543 , 0.19492 , -0.36403 , 0.053196, 0.26225 , -0.29054 , -0.64883 , -0.057846, 0.21646 , 0.40237 , -0.1413 , -0.015453, -0.11988 , -0.99837 , -0.066328, 0.13118 ])
life = wordvecs[np.where(vocab=="life")[0][0], :]
death = wordvecs[np.where(vocab=="death")[0][0], :]
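np.where scans the whole vocab array on every query. For repeated lookups, a plain dict from word to row index is faster; a small sketch (word2idx is a name introduced here, not used elsewhere in the notebook):
# Build once; lookups are then O(1).
word2idx = {w: i for i, w in enumerate(vocab)}
life  = wordvecs[word2idx["life"]]
death = wordvecs[word2idx["death"]]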
If you look carefully, the values in the two vectors are somewhat correlated, position by position: for example, the first value is moderately positive in both, the fourth is negative in both, the fifth is positive in both, and so on.
life,death
(array([ 0.51491 , 0.88806 , -0.71906 , -0.5748 , 0.85655 , 0.52474 , -0.31788 , -0.20168 , 0.17936 , 0.51999 , -0.11527 , 0.59296 , -0.3468 , 0.052568, 0.87153 , -0.036582, -0.056057, 0.08516 , 0.036249, 0.23403 , 0.073175, 1.1394 , -0.17921 , -0.034245, 0.69977 , -1.6516 , -1.106 , -0.44145 , 0.77042 , 0.23963 , 3.1823 , -0.020451, -0.056117, -0.69918 , -0.19543 , 0.19492 , -0.36403 , 0.053196, 0.26225 , -0.29054 , -0.64883 , -0.057846, 0.21646 , 0.40237 , -0.1413 , -0.015453, -0.11988 , -0.99837 , -0.066328, 0.13118 ]), array([ 4.9089e-01, 3.2534e-01, 1.4417e-03, -8.4331e-01, 6.5698e-01, 1.0089e+00, 2.4760e-01, 6.8178e-01, 1.3553e-01, 1.8069e-01, -5.0148e-02, -1.2704e-01, -4.8039e-01, -5.5024e-01, 1.5247e+00, -4.3059e-01, -8.8914e-01, -5.8294e-01, -7.4690e-01, 5.8781e-01, -8.1227e-02, 4.8804e-01, 2.9895e-01, -4.0216e-01, 3.1343e-01, -2.4331e+00, -8.3064e-01, -7.7042e-01, 2.9021e-01, 5.6141e-01, 2.3164e+00, -7.1611e-01, -5.5002e-01, -6.1970e-01, 1.6241e-01, 8.4582e-02, 9.5919e-01, -7.1344e-01, 4.7596e-01, 3.0941e-01, -8.5852e-01, 5.4195e-01, -2.5035e-01, -1.8671e-01, 4.5843e-01, 3.9168e-02, -5.6138e-01, -1.1905e+00, 2.1134e-01, -5.9986e-01]))
plt.scatter(life,death) # each point is one dimension. not really meaningful
<matplotlib.collections.PathCollection at 0x7fd1de1c40a0>
life.dot(death)
18.884996175899
np.linalg.norm(life)
4.895247333451396
So let's use a dot product (aka inner product) to make this comparison quantitative. Walking down the two vectors in parallel: whenever both values are highly positive together, or highly negative together, they contribute a large positive term, so the inner product will tend to be high.
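Concretely, the dot product is just "multiply the two vectors elementwise, then sum", so dimensions where the signs agree push the score up and dimensions where they disagree pull it down:
# Same number, two ways: elementwise product-and-sum equals the dot product.
(life * death).sum(), life.dot(death)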
See paper notes: cosine similarity as a normalized dot product
def cossim(x,y):
return x.dot(y) / np.linalg.norm(x) / np.linalg.norm(y)
Sanity check: cosine similarity maxes out at 1 (a vector against itself gives exactly 1), and two vectors pointing in roughly the same direction get close to that.
cossim( np.array([ 1,2,3]), np.array([1,2,10] ))
0.9128709291752769
Sanity check: some opposite signs => very dissimilar.
cossim( np.array([ 1, -2,-3]), np.array([1, 2, 3] ))
-0.8571428571428572
cossim(life, death)
0.7264110537105964
Let's find the top-10 most similar words to "life": (1) calculate the similarity of every word in the vocab to "life", then (2) sort in descending order and take the first 10 of the list.
Useful trick: get a sorted list of indices with numpy.argsort(). (Or in vanilla Python: inds = sorted(range(N), key=lambda i: values[i]).) Sorting/ranking by scores, and taking the top-10, is something you do all the time when analyzing NLP systems.
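For comparison, the vanilla-Python version of that trick on a toy list (made-up values) looks like this, and it gives the same ordering as the np.argsort example a few cells down:
values = [100000, 20, 30]
sorted(range(len(values)), key=lambda i: values[i])  # -> [1, 2, 0], indices in ascending order of value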
w=5002; cossim(life, wordvecs[w,:])
0.5360036450897964
scores = np.array( [ cossim(life, wordvecs[w, :]) for w in range(len(vocab)) ] )
scores
array([ 0.70148617, 0.72279564, 0.78045729, ..., -0.34808466, -0.56667139, -0.45042888])
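The list comprehension above loops over 400,000 words in Python, which is slow. A vectorized equivalent (a sketch, not what the notebook actually runs) normalizes all rows once and does a single matrix-vector product:
# Cosine similarity of every row of wordvecs against `life`, all at once.
norms = np.linalg.norm(wordvecs, axis=1)                        # shape (400000,)
scores_fast = (wordvecs @ life) / (norms * np.linalg.norm(life))
assert np.allclose(scores_fast, scores)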
np.argsort( [100000, 20, 30] )
array([1, 2, 0])
np.argsort(-scores)
array([ 214, 1676, 835, ..., 374068, 384008, 169096])
w=np.argsort(-scores)[:10]
vocab[w]
array(['life', 'mind', 'love', 'lives', 'own', 'kind', 'experience', 'child', 'perhaps', 'she'], dtype='<U68')
for w in np.argsort(-scores)[:50]:
print(scores[w], vocab[w])
1.0000000000000002 life
0.8514841863412058 mind
0.8403438472391277 love
0.8392689150907717 lives
0.8369905081691957 own
0.8338872609897519 kind
0.8213189099822804 experience
0.8168196357102481 child
0.8082367683717586 perhaps
0.8081038679210564 she
0.8071581706010772 whose
0.8049614685826322 indeed
0.8037770289741493 her
0.802373620355454 same
0.8022423092983956 work
0.8017044107871999 true
0.8002954360209007 way
0.8001490924492579 once
0.799655904040989 fact
0.7994166192262724 this
0.7988869333380826 much
0.7985275903650746 how
0.798180079965251 even
0.7978290205117401 what
0.7951681282559191 actually
0.7948194446857774 thought
0.7947627270422303 as
0.7946196651052868 sort
0.7941117143169732 seeing
0.793220754207921 finds
0.7923321747742362 so
0.792241736627864 brought
0.7903729799690244 goes
0.7894630246685433 well
0.7886037370145023 always
0.7875016505301097 couple
0.7859511112946416 my
0.7855102118651646 something
0.7846166295423811 one
0.7843958845695762 gone
0.7842895404911405 great
0.7829147342143828 our
0.7818606363822782 time
0.7811810935601861 good
0.780949296803288 age
0.7808370060619523 still
0.7804572898798426 .
0.7802388200085802 rather
0.7793383779556715 nothing
0.7789931308335858 years
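We're about to repeat this lookup / score / argsort pattern for more query words, so it may help to wrap it in a small helper (a sketch; most_similar is a name introduced here):
def most_similar(word, k=10):
    # Top-k (score, word) pairs by cosine similarity to `word`.
    x = wordvecs[np.where(vocab == word)[0][0], :]
    scores = np.array([cossim(x, wordvecs[w, :]) for w in range(len(vocab))])
    return [(scores[w], vocab[w]) for w in np.argsort(-scores)[:k]]
most_similar("table")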
x = wordvecs[np.where(vocab=="table")[0][0], :]
scores = np.array( [ cossim(x, wordvecs[w, :]) for w in range(len(vocab)) ] )
for w in np.argsort(-scores)[:50]:
print(scores[w], vocab[w])
1.0 table
0.8177437175469351 tables
0.7994711524534881 place
0.7571903424691973 sit
0.7330496092265292 set
0.7316289338715505 open
0.7306693840300171 hold
0.7287872495141072 here
0.7207204399247122 each
0.7166720276212438 bottom
0.7153381170200149 top
0.710887528452398 room
0.7035431366064223 full
0.6953041429620398 next
0.6948687598167882 sitting
0.6913973405308655 pool
0.6912088964342343 door
0.6891001882776997 side
0.6885255075223766 stand
0.6869691314529636 wrap
0.6857859147505004 setting
0.6839622695235154 final
0.6828282463422229 spot
0.6817818796614848 opening
0.6808672636507198 dressing
0.676198457502724 instead
0.6727297916369688 placed
0.6715795041895688 round
0.6702148416912516 finish
0.6702129327164665 cup
0.6701828691261177 draw
0.6688733106588504 standing
0.667382356019513 go
0.6672315311436365 placing
0.6663863282107575 put
0.6659159090845219 filled
0.6630361303642418 time
0.6625288477971126 holds
0.6612416202208715 aside
0.6594355919017335 shape
0.6576598660473933 lunch
0.6572609471224473 add
0.6567405555397445 regular
0.6558957086877858 sides
0.6549552436713014 take
0.6548200154002345 row
0.6547831567768505 one
0.6527954691838531 wrapped
0.6514518981593479 with
0.6506336166696974 usually
x = wordvecs[np.where(vocab=="car")[0][0], :]
scores = np.array( [ cossim(x, wordvecs[w, :]) for w in range(len(vocab)) ] )
for w in np.argsort(-scores)[:10]:
print(scores[w], vocab[w])
1.0 car
0.9208586184905553 truck
0.8870189657270604 cars
0.8833684148214744 vehicle
0.8464018882420629 driver
0.838418921664126 driving
0.8210511277325926 bus
0.8174993206120538 vehicles
0.7902189195323365 parked
0.7866502930143751 motorcycle
x = wordvecs[np.where(vocab=="car")[0][0], :]
y = wordvecs[np.where(vocab=="trucks")[0][0], :]
cossim(x,y)
0.7730797227498432
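The same pattern works for any pair of words; for example (a sketch, reusing the hypothetical word2idx dict from earlier) we could scan a few pairs in one go:
for a, b in [("car", "truck"), ("car", "bus"), ("car", "table")]:
    print(a, b, cossim(wordvecs[word2idx[a]], wordvecs[word2idx[b]]))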