Source code for cadl.glove

"""Global Vector Embeddings.
"""
"""
Copyright 2017 Parag K. Mital.  See also NOTICE.md.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import matplotlib.pyplot as plt
from cadl import utils
import zipfile
from scipy.spatial import distance, distance_matrix
from sklearn.decomposition import PCA


def get_model():
    """Download the GloVe 6B 300-dimensional embeddings and build a lookup table.

    Returns
    -------
    wordvecs : np.ndarray
        Array of shape (n_words, 300), one embedding per row.
    word2id : dict
        Mapping from word to its row index in `wordvecs`.
    words : list of str
        Vocabulary in row order.
    """
    # Download the glove model and open a zip file
    file = utils.download('http://nlp.stanford.edu/data/wordvecs/glove.6B.zip')
    zf = zipfile.ZipFile(file)

    # Collect the words and their vectors
    words = []
    vectors = []
    for l in zf.open("glove.6B.300d.txt"):
        t = l.strip().split()
        words.append(t[0].decode())
        vectors.append(list(map(np.double, t[1:])))

    # Store as a lookup table
    wordvecs = np.asarray(vectors, dtype=np.double)
    word2id = {word: i for i, word in enumerate(words)}
    return wordvecs, word2id, words
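# A minimal usage sketch (not part of the original module): it assumes the
# download above succeeds and that both query words are in the GloVe
# vocabulary. It shows how the lookup table returned by get_model() can be
# used to compare two words by cosine distance. The helper name and its
# default arguments are hypothetical.
def example_word_similarity(word_a='king', word_b='queen'):
    """Hypothetical helper: print the cosine distance between two words."""
    wordvecs, word2id, words = get_model()
    vec_a = wordvecs[word2id[word_a]]
    vec_b = wordvecs[word2id[word_b]]
    # A smaller cosine distance means the embeddings point in more similar
    # directions, i.e. the words appear in more similar contexts.
    print(word_a, word_b, distance.cosine(vec_a, vec_b))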
def course_example():
    """Walk through exploring the GloVe embedding: nearest words, distance
    matrices, PCA projections, and word-vector arithmetic."""
    wordvecs, word2id, words = get_model()
    word = '2000'
    print(word2id[word])
    print(wordvecs[word2id[word]])

    # Get distances to target word
    target_vec = wordvecs[word2id[word]]
    dists = []
    for vec_i in wordvecs:
        dists.append(distance.cosine(target_vec, vec_i))

    k = 20

    # Print top nearest words
    idxs = np.argsort(dists)
    for idx_i in idxs[:k]:
        print(words[idx_i], dists[idx_i])

    # Plot top nearest words
    labels = [words[idx_i] for idx_i in idxs[:k]]
    plt.figure()
    plt.bar(range(k), [dists[idx_i] for idx_i in idxs[:k]])
    ax = plt.gca()
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation='vertical')
    plt.xlabel('label')
    plt.ylabel('distances')

    # Create distance matrix
    vecs = [wordvecs[idx_i] for idx_i in idxs[:k]]
    dm = distance_matrix(vecs, vecs)
    plt.figure()
    plt.imshow(dm)
    ax = plt.gca()
    ax.set_xticks(range(len(labels)))
    ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_yticklabels(labels)
    plt.colorbar()

    # Plot data points in reduced dimensionality using principal components
    # of the distance matrix
    res = PCA(2).fit_transform(dm / np.mean(dm, axis=0, keepdims=True))
    pc1, pc2 = res[:, 0], res[:, 1]
    plt.figure()
    plt.scatter(pc1, pc2)
    for i in range(len(labels)):
        plt.text(pc1[i], pc2[i], labels[i])

    # Let's stick it all in a function and explore some other words:
    def plot_nearest_words(word, k=20):
        """Plot the k nearest words to `word`: a distance matrix and a 2D PCA
        projection of that matrix.

        Parameters
        ----------
        word : str
            Query word; must be in the vocabulary.
        k : int, optional
            Number of nearest words to show.
        """
        # Get distances to target word
        target_vec = wordvecs[word2id[word]]
        dists = []
        for vec_i in wordvecs:
            dists.append(distance.cosine(target_vec, vec_i))
        idxs = np.argsort(dists)
        labels = [words[idx_i] for idx_i in idxs[:k]]
        vecs = [wordvecs[idx_i] for idx_i in idxs[:k]]
        dm = distance_matrix(vecs, vecs)
        fig, axs = plt.subplots(1, 2, figsize=(10, 4))

        # Show the distance matrix
        axs[0].imshow(dm)
        axs[0].set_xticks(range(len(labels)))
        axs[0].set_yticks(range(len(labels)))
        axs[0].set_xticklabels(labels, rotation='vertical')
        axs[0].set_yticklabels(labels)

        # Center the distance matrix
        dm = dm / np.mean(dm, axis=0, keepdims=True)

        # Plot data points in reduced dimensionality using principal components
        # of the distance matrix
        res = PCA(2).fit_transform(dm)
        pc1, pc2 = res[:, 0], res[:, 1]
        axs[1].scatter(pc1, pc2)
        for i in range(len(labels)):
            axs[1].text(pc1[i], pc2[i], labels[i])

    plot_nearest_words('2000')
    plot_nearest_words('intelligence')

    # What else can we explore? This embedding is "linear", meaning we can
    # actually try performing arithmetic in this space. A classic example is
    # what happens when we compute "king" - "man" + "woman". In other words,
    # can the word embedding understand analogies? If man is to king as woman
    # is to queen, then subtracting "man" from "king" and adding "woman"
    # should land near "queen".
    # Let's create a function which will return the nearest words rather than
    # plot them:
    def get_nearest_words(target_vec, k=20):
        """Return the k words whose embeddings are closest (by cosine distance)
        to `target_vec`.

        Parameters
        ----------
        target_vec : np.ndarray
            Query vector in the embedding space.
        k : int, optional
            Number of nearest words to return.

        Returns
        -------
        list of (str, float)
            The nearest words and their cosine distances, closest first.
        """
        # Get distances to target vector
        dists = []
        for vec_i in wordvecs:
            dists.append(distance.cosine(target_vec, vec_i))

        # Get top nearest words
        idxs = np.argsort(dists)
        res = []
        for idx_i in idxs[:k]:
            res.append((words[idx_i], dists[idx_i]))
        return res

    # And a convenience function for returning a vector
    def get_vector(word):
        """Return the embedding vector for `word`.

        Parameters
        ----------
        word : str
            Query word; must be in the vocabulary.

        Returns
        -------
        np.ndarray
            The 300-dimensional embedding for `word`.
        """
        return wordvecs[word2id[word]]

    # Now we can try some word embedding arithmetic
    get_nearest_words(get_vector('king') - get_vector('man') + get_vector('woman'))
    get_nearest_words(get_vector('france') - get_vector('french') + get_vector('spain'))
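# A vectorized alternative (a sketch, not part of the original module): the
# loops above call scipy's distance.cosine once per vocabulary word, which is
# slow in pure Python. The same nearest-word query can be done with a single
# matrix product after normalizing the embeddings to unit length. The function
# name and its signature are hypothetical.
def example_nearest_words_fast(word, wordvecs, word2id, words, k=20):
    """Return the k nearest words to `word` using vectorized cosine distance."""
    # Normalize every embedding so that a dot product equals cosine similarity.
    norms = np.linalg.norm(wordvecs, axis=1, keepdims=True)
    unit = wordvecs / norms
    sims = unit @ unit[word2id[word]]
    # Cosine distance = 1 - cosine similarity; smallest distances first.
    idxs = np.argsort(1.0 - sims)[:k]
    return [(words[i], float(1.0 - sims[i])) for i in idxs]


# Example usage (assumes the model has already been loaded):
#   wordvecs, word2id, words = get_model()
#   print(example_nearest_words_fast('intelligence', wordvecs, word2id, words))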