-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword2vec_python code
More file actions
158 lines (106 loc) · 4.39 KB
/
word2vec_python code
File metadata and controls
158 lines (106 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
from collections import defaultdict
class word2vec():
    """Minimal skip-gram word2vec trained with a full softmax and plain SGD.

    Typical use: build the model, call ``training_data`` on a tokenized
    corpus (this also builds the vocabulary), call ``train`` on the result,
    then query ``w_vector`` / ``similarity``.

    Attributes set by ``training_data``:
        v_count: Vocabulary size.
        word_list: Vocabulary words in first-seen order.
        word_index: Mapping word -> vocabulary index.
        index_word: Mapping vocabulary index -> word.

    Attributes set by ``train``:
        w1: Input-to-hidden weights, shape ``(v_count, n)``.
        w2: Hidden-to-output weights, shape ``(n, v_count)``.
        loss: Accumulated loss of the most recent epoch.
    """

    def __init__(self, setting_n, setting_lr, setting_epochs, settings_window):
        """Store the hyper-parameters.

        Args:
            setting_n: Embedding dimension (width of the hidden layer).
            setting_lr: Learning rate applied in the SGD step.
            setting_epochs: Number of passes over the training pairs.
            settings_window: Number of context words taken on each side of
                the target word.
        """
        self.n = setting_n
        self.lr = setting_lr
        self.epochs = setting_epochs
        self.window = settings_window

    def training_data(self, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Args:
            corpus: Iterable of sentences, each a sequence of word tokens.
                It is iterated twice, so it must be re-iterable.

        Returns:
            An object-dtype ``numpy.ndarray`` of shape ``(num_words, 2)``.
            Column 0 holds the one-hot list of the target word, column 1
            holds the list of one-hot lists of its context words.
        """
        word_count = defaultdict(int)
        for row in corpus:
            for word in row:
                word_count[word] += 1

        self.v_count = len(word_count)
        self.word_list = list(word_count)
        self.word_index = {word: i for i, word in enumerate(self.word_list)}
        self.index_word = dict(enumerate(self.word_list))

        pairs = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # The window covers `self.window` words on each side of the
                # target (window 3 -> span of 7 = 3 * 2 + 1), clipped to the
                # sentence boundaries.
                first = max(0, i - self.window)
                last = min(sent_len, i + self.window + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                pairs.append((w_target, w_context))

        # Context lists have different lengths near sentence edges, so
        # np.array(pairs) would be ragged (a ValueError on recent NumPy).
        # Fill an explicit object array so the result is always (N, 2).
        train_data = np.empty((len(pairs), 2), dtype=object)
        for k, (w_target, w_context) in enumerate(pairs):
            train_data[k, 0] = w_target
            train_data[k, 1] = w_context
        return train_data

    def word2onehot(self, word):
        """Return the one-hot encoding of ``word`` as a list of ints.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        word_vec = [0] * self.v_count
        word_vec[self.word_index[word]] = 1
        return word_vec

    def train(self, train_data):
        """Initialize the weights randomly and run SGD for ``self.epochs`` epochs.

        Prints the accumulated loss after every epoch.

        Args:
            train_data: Iterable of ``(target, context)`` pairs as produced
                by ``training_data``.
        """
        # Random weight initialization.
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        for i in range(self.epochs):
            self.loss = 0
            for target, context in train_data:
                # A word with no neighbours (one-word sentence) carries no
                # training signal, and its scalar "error" would break the
                # shapes in backprop, so skip it.
                if len(context) == 0:
                    continue

                y_pred, h, u = self.calc(target)

                # Summed prediction error over all context words.
                error = np.sum(
                    [np.subtract(y_pred, word) for word in context], axis=0
                )

                # Backpropagation / weight update.
                self.backprop(error, h, target)

                # Loss: -sum(u[context]) + |context| * log(sum(exp(u))),
                # with the log-sum-exp shifted by max(u) to avoid overflow.
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                context_score = sum(
                    u[int(np.argmax(word))] for word in context
                )
                self.loss += -context_score + len(context) * log_norm
            print("Epoch :", i, "Loss :", self.loss)

    def calc(self, value):
        """Forward pass for one one-hot input vector.

        Returns:
            Tuple ``(y, h, u)``: softmax output, hidden activation and raw
            output scores.
        """
        h = np.dot(self.w1.T, value)
        u = np.dot(self.w2.T, h)
        y = self.softmax(u)
        return y, h, u

    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0 (shifted by max for stability)."""
        exps = np.exp(x - np.max(x))
        return exps / exps.sum(axis=0)

    def backprop(self, e, h, x):
        """Apply one SGD step to ``w1`` and ``w2``.

        Args:
            e: Summed output error, length ``v_count``.
            h: Hidden activation from the forward pass, length ``n``.
            x: One-hot input vector, length ``v_count``.
        """
        # Gradients of the loss with respect to both weight matrices.
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        dl_dw2 = np.outer(h, e)
        # SGD update.
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def w_vector(self, word):
        """Return the embedding of ``word`` (its row of ``w1``).

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.w1[self.word_index[word]]

    def similarity(self, word):
        """Print every vocabulary word with its cosine similarity to ``word``.

        Output is sorted from most to least similar; the query word itself
        is included.
        """
        vec_w1 = self.w_vector(word)
        norm_w1 = np.linalg.norm(vec_w1)

        word_sim = dict()
        for i in range(self.v_count):
            vec_w2 = self.w1[i]
            theta_sum = np.dot(vec_w1, vec_w2)
            theta_den = norm_w1 * np.linalg.norm(vec_w2)
            word_sim[self.index_word[i]] = theta_sum / theta_den

        word_sort = sorted(word_sim.items(), key=lambda x: x[1], reverse=True)
        for other, sim in word_sort:
            print(other, sim)
# Data preprocessing: parse the TED transcript XML.
# The context manager closes the file handle as soon as parsing is done.
with open("ted_en-20160408.xml", "r", encoding="UTF8") as target_XML:
    target_text = etree.parse(target_XML)

# Keep only the text found between <content> and </content>.
parse_text = "\n".join(target_text.xpath("//content/text()"))

# Drop parenthesised spans from the transcript.
content_text = re.sub(r'\([^)]*\)', '', parse_text)

sent_text = sent_tokenize(content_text)

# Lower-case each sentence and collapse every run of characters other than
# a-z / 0-9 into a single space.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", string.lower()) for string in sent_text
]

result = [word_tokenize(sentence) for sentence in normalized_text]

# word2vec(dimension, learning_rate, epochs, window)
model = word2vec(10, 0.01, 100, 2)

# Only the first 10 sentences are used for training.
train_data = model.training_data(result[:10])
model.train(train_data)

# NOTE(review): w_vector/similarity index the vocabulary directly, so this
# raises KeyError if "fantastic" is not among the first 10 sentences.
vector = model.w_vector("fantastic")
print(vector)

model.similarity("fantastic")