word2vec Exercise

pytorch_word2vec_model.py

import torch
import torch.nn as nn


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        # One embedding table shared by center (focus) and context words.
        self.embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        embed_focus = self.embeddings(focus)
        embed_ctx = self.embeddings(context)
        # Score each (focus, context) pair by the dot product of their embeddings.
        # The raw score is a logit; BCEWithLogitsLoss applies the sigmoid itself.
        score = torch.mul(embed_focus, embed_ctx).sum(dim=1)
        return score

    def loss(self, scores, target):
        # Binary classification: target is 1 for a true context word, 0 for a negative sample.
        loss_fn = nn.BCEWithLogitsLoss()
        return loss_fn(scores, target)
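Before wiring this into the training script, a quick smoke test helps confirm the shapes. This is a minimal sketch, assuming the class above is saved as pytorch_word2vec_model.py; the toy vocabulary size, batch size and embedding size are made up:

import torch
from pytorch_word2vec_model import SkipGram

model = SkipGram(vocab_size=100, embd_size=16)   # toy sizes for the check
focus = torch.randint(0, 100, (8,))              # 8 center-word ids
context = torch.randint(0, 100, (8,))            # 8 context / negative-sample ids
target = torch.randint(0, 2, (8,)).float()       # 1 = real context, 0 = negative sample

scores = model(focus, context)                   # shape (8,): one logit per pair
print(scores.shape, model.loss(scores, target).item())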

pytorch_train.py

import random
import re

import torch
import torch.optim as optim
from tqdm import tqdm
from pytorch_word2vec_model import SkipGram

epochs = 50
negative_sampling = 4   # negative samples drawn per center word
window = 2              # context window radius
embd_size = 300

device = "cuda" if torch.cuda.is_available() else "cpu"


def batch_data(x, batch_size=128):
    """Yield [focus_ids, context_ids, targets] batches; uses the module-level vocab_size."""
    in_w = []
    out_w = []
    target = []
    for text in x:
        for i in range(window, len(text) - window):
            # Words inside the current window; negative samples must not come from here.
            word_set = set(text[i - window:i + window + 1])

            # Positive pairs: the center word paired with every word in its window.
            for j in range(-window, window + 1):
                if j == 0:
                    continue
                in_w.append(text[i])
                out_w.append(text[i + j])
                target.append(1)

            # Negative sampling: random ids outside the window get target 0.
            count = 0
            while count < negative_sampling:
                rand_id = random.randint(0, vocab_size - 1)
                if rand_id not in word_set:
                    in_w.append(text[i])
                    out_w.append(rand_id)
                    target.append(0)
                    count += 1

            if len(out_w) >= batch_size:
                yield [in_w, out_w, target]
                in_w = []
                out_w = []
                target = []
    if out_w:
        yield [in_w, out_w, target]


def train(train_text_id, model, opt):
    model.train()  # enable training mode (affects dropout / batch norm, if present)
    ave_loss = 0
    pbar = tqdm()
    cnt = 0
    for x_batch in batch_data(train_text_id):
        in_w, out_w, target = x_batch
        in_w_var = torch.tensor(in_w).to(device)
        out_w_var = torch.tensor(out_w).to(device)
        target_var = torch.tensor(target, dtype=torch.float).to(device)

        model.zero_grad()
        scores = model(in_w_var, out_w_var)
        loss = model.loss(scores, target_var)
        loss.backward()
        opt.step()
        ave_loss += loss.item()
        pbar.update(1)
        cnt += 1
        pbar.set_description('< loss: %.5f >' % (ave_loss / cnt))
    pbar.close()


# Build the vocabulary and convert the corpus into lists of word ids.
text_id = []
vocab_dict = {}

with open('corpus.txt', encoding='utf-8') as fp:
    for line in fp:
        words = re.sub("[^A-Za-z0-9']+", ' ', line).lower().split()
        line_id = []
        for s in words:
            if not s:
                continue
            if s not in vocab_dict:
                vocab_dict[s] = len(vocab_dict)
            word_id = vocab_dict[s]
            line_id.append(word_id)
            if word_id == 11500:  # leftover debug check for one specific id
                print(word_id, s)
        text_id.append(line_id)

vocab_size = len(vocab_dict)
print('vocab_size', vocab_size)

model = SkipGram(vocab_size, embd_size).to(device)
# Create the optimizer once, so Adam's state persists across epochs.
opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                 lr=0.001, weight_decay=0)

for epoch in range(epochs):
    print('epoch', epoch)
    train(text_id, model, opt)
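After training, the learned vectors live in model.embeddings.weight. As an illustration only (not part of the original script; query_word is a made-up example and the snippet assumes the script above has just run), a cosine-similarity nearest-neighbour lookup might look like this:

import torch.nn.functional as F

id2word = {i: w for w, i in vocab_dict.items()}

# L2-normalise the embeddings so a dot product equals cosine similarity.
emb = F.normalize(model.embeddings.weight.detach(), dim=1)

query_word = 'king'                    # hypothetical query; any word in vocab_dict works
query_vec = emb[vocab_dict[query_word]]
sims = emb @ query_vec                 # cosine similarity with every vocabulary word
top = sims.topk(6).indices.tolist()    # the query itself plus its five nearest neighbours
print([id2word[i] for i in top])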

A logit maps a value in \((0,1)\) to \((-\infty, \infty)\). For binary (0/1) classification, if the probability of class 1 is \(p\), the transformation is \(\log\frac{p}{1-p}\). At first I assumed it was always something of the form \(\frac{\text{some probability}}{1-\text{that probability}}\), but after studying the logistic model I found that is not quite the whole story: for \(N\)-class classification it looks more like \(\log\frac{p_k}{p_N}\), where \(k = 1, 2, \ldots, N-1\).
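Concretely, inverting the binary logit recovers the sigmoid, which makes the connection between the two explicit:

\[
z = \log\frac{p}{1-p}
\;\Longleftrightarrow\;
e^{z} = \frac{p}{1-p}
\;\Longleftrightarrow\;
p = \frac{e^{z}}{1+e^{z}} = \frac{1}{1+e^{-z}} = \sigma(z)
\]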

In the code, BCEWithLogitsLoss() goes the other way: it maps a value in \((-\infty, \infty)\) back into \((0,1)\), which is exactly what the sigmoid function does, and then applies binary cross-entropy. In fact, the sigmoid is what you get by inverting the logit above.
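As a quick sanity check (a small sketch that is not part of the original scripts), BCEWithLogitsLoss on raw scores should give the same value as applying the sigmoid first and then BCELoss:

import torch
import torch.nn as nn

scores = torch.randn(8)                      # raw dot-product scores (logits)
target = torch.randint(0, 2, (8,)).float()   # 1 = true context word, 0 = negative sample

loss_with_logits = nn.BCEWithLogitsLoss()(scores, target)
loss_manual = nn.BCELoss()(torch.sigmoid(scores), target)

print(loss_with_logits.item(), loss_manual.item())  # identical up to floating-point error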
