
Fudan NLP-Beginner Task 1

Task: Task 1 of FudanNLP/nlp-beginner.

Task 1: Text Classification Based on Machine Learning

Implement text classification based on logistic/softmax regression.

  1. References
    1. Text classification
    2. 《神经网络与深度学习》 (Neural Networks and Deep Learning), Chapters 2 and 3
  2. Dataset: Classify the sentiment of sentences from the Rotten Tomatoes dataset
  3. Implementation requirement: NumPy
  4. Knowledge points to understand:
    1. Text feature representation: Bag-of-Words, N-gram
    2. Classifiers: logistic/softmax regression; loss functions, (stochastic) gradient descent, feature selection (a minimal NumPy sketch of this baseline appears right after this list)
  5. Experiments:
    1. Analyze how different features, loss functions, and learning rates affect the final classification performance
    2. shuffle, batch, mini-batch
  6. Time: two weeks
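
For reference only, the baseline that items 4.1 and 4.2 describe, Bag-of-Words features plus softmax regression trained by gradient descent in pure NumPy, could be sketched roughly as below. The toy corpus and variable names are made up for illustration; it is not the approach used in the rest of this post.

import numpy as np

# Toy corpus for illustration; in the real task these would come from train.tsv.
texts = ["a touching movie", "a boring movie", "touching and wonderful"]
labels = np.array([3, 1, 4])
num_classes = 5

# 1. Bag-of-Words features: one count per vocabulary word.
vocab = sorted({w for t in texts for w in t.split()})
word_to_idx = {w: i for i, w in enumerate(vocab)}
X = np.zeros((len(texts), len(vocab)))
for i, t in enumerate(texts):
    for w in t.split():
        X[i, word_to_idx[w]] += 1

# 2. Softmax regression trained with full-batch gradient descent.
W = np.zeros((len(vocab), num_classes))
b = np.zeros(num_classes)
Y = np.eye(num_classes)[labels]                      # one-hot labels
lr = 0.1
for _ in range(200):
    scores = X @ W + b
    scores -= scores.max(axis=1, keepdims=True)      # for numerical stability
    probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    grad = (probs - Y) / len(texts)                  # gradient of mean cross-entropy w.r.t. scores
    W -= lr * (X.T @ grad)
    b -= lr * grad.sum(axis=0)

print(probs.argmax(axis=1))                          # predicted classes on the toy corpus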

Those are the task requirements. However, since Bag-of-Words and N-gram features are fairly traditional and may not handle the classification task well, I decided to try Word2Vec instead and submit the results on "Classify the sentiment of sentences from the Rotten Tomatoes dataset".

In a nutshell, Word2Vec (with the Skip-gram model) trains two matrices: an input-vector matrix for center words (in_embed) and an output-vector matrix for context words (out_embed). The two matrices are optimized jointly by maximizing the similarity between each center word and its context words (dot products converted to probabilities via softmax), so that semantically related words end up closer together in the vector space.
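
In formulas (the standard Skip-gram objective with a full softmax, which is what the code below implements; $v_c$ is the in_embed row of the center word $c$, $u_o$ the out_embed row of a context word $o$, and $V$ the vocabulary):

$$P(o \mid c) = \frac{\exp(u_o^{\top} v_c)}{\sum_{w \in V} \exp(u_w^{\top} v_c)}, \qquad \mathcal{L} = -\sum_{(c,\,o)} \log P(o \mid c)$$

Minimizing $\mathcal{L}$ over all observed (center, context) pairs pushes up the score $u_o^{\top} v_c$ of each observed pair relative to every other word in the vocabulary.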

Once Word2Vec is trained, the word embeddings carry semantic information. The mean of a phrase's word vectors is taken as the sentence vector, and a classifier is then learned on top of it.
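
The sentence vector is literally just the mean of the token embeddings. A minimal sketch of that step, assuming a dict word_to_vec mapping tokens to NumPy vectors (the full script below builds exactly such a dict, and its SentimentDataset class performs this same averaging):

import numpy as np
from nltk.tokenize import word_tokenize

def phrase_to_vector(phrase, word_to_vec, dim):
    # Average the vectors of known tokens; phrases with no known token fall back to a zero vector.
    tokens = word_tokenize(phrase.lower())
    vecs = [word_to_vec[t] for t in tokens if t in word_to_vec]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)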

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm


class Word2VecDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)  # context-word vectors

    def forward(self, center_idx):
        # Score every vocabulary word as a possible context of each center word (full softmax).
        center_vec = self.in_embed(center_idx)
        scores = torch.matmul(center_vec, self.out_embed.weight.T)
        log_probs = nn.functional.log_softmax(scores, dim=1)
        return log_probs

class SentimentDataset(Dataset):
    # Represents each phrase as the mean of its tokens' Word2Vec vectors.
    def __init__(self, phrases, labels, word_to_vec, dim):
        self.vectors = []
        self.labels = labels.values if labels is not None else None
        for phrase in tqdm(phrases, desc="Processing phrases for Dataset", ncols=80):
            tokens = word_tokenize(phrase.lower())
            vectors = [word_to_vec[word] for word in tokens if word in word_to_vec]
            if vectors:
                avg_vec = np.mean(vectors, axis=0)
            else:
                avg_vec = np.zeros(dim)
            self.vectors.append(torch.tensor(avg_vec, dtype=torch.float32))

    def __len__(self):
        return len(self.vectors)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.vectors[idx], self.labels[idx]
        else:
            return self.vectors[idx]

class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

if __name__ == '__main__':

    # nltk.download('punkt')
    # 1. Load the data
    train_df = pd.read_csv("train.tsv", sep="\t")
    test_df = pd.read_csv("test.tsv", sep="\t")

    # 2. Build the vocabulary
    all_phrases = pd.concat([train_df['Phrase'], test_df['Phrase']], ignore_index=True)
    tokenized_phrases = [word_tokenize(p.lower()) for p in all_phrases]

    tokens = [word for phrase in tokenized_phrases for word in phrase]
    vocab = sorted(set(tokens))
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    vocab_size = len(vocab)

    # 3. Build the Word2Vec training pairs
    window_size = 2
    training_pairs = []
    for phrase in tqdm(tokenized_phrases, desc="Building training pairs", ncols=80):
        for i, center_word in enumerate(phrase):
            if center_word not in word_to_idx:
                continue
            center_idx = word_to_idx[center_word]
            for j in range(-window_size, window_size + 1):
                if j == 0 or i + j < 0 or i + j >= len(phrase):
                    continue
                context_word = phrase[i + j]
                if context_word in word_to_idx:
                    context_idx = word_to_idx[context_word]
                    training_pairs.append((center_idx, context_idx))

    # 4. Initialize the model, loss, optimizer, DataLoader, etc.
    embedding_dim = 100
    w2v_model = Word2Vec(vocab_size, embedding_dim)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"using {device}")
    w2v_model.to(device)

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(w2v_model.parameters(), lr=0.001)

    batch_size = 512
    train_dataset = Word2VecDataset(training_pairs)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    # 5. Train Word2Vec
    for epoch in range(5):
        w2v_model.train()
        total_loss = 0
        for center_batch, context_batch in tqdm(train_loader, desc=f"Word2Vec Epoch {epoch+1}", ncols=80):
            center_batch = center_batch.to(device)
            context_batch = context_batch.to(device)
            optimizer.zero_grad()
            log_probs = w2v_model(center_batch)
            loss = criterion(log_probs, context_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * center_batch.size(0)
        avg_loss = total_loss / len(train_dataset)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # 6. Extract the word vectors
    word_vectors = w2v_model.in_embed.weight.data.cpu().numpy()
    word_to_vec = {word: word_vectors[word_to_idx[word]] for word in vocab}

    # 7. Build the sentiment datasets
    train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)
    train_dataset = SentimentDataset(train_data['Phrase'], train_data['Sentiment'], word_to_vec, embedding_dim)
    val_dataset = SentimentDataset(val_data['Phrase'], val_data['Sentiment'], word_to_vec, embedding_dim)
    test_dataset = SentimentDataset(test_df['Phrase'], None, word_to_vec, embedding_dim)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # 8. Define the sentiment classification model
    model = SentimentClassifier(embedding_dim, 5).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 9. Train the sentiment classifier
    for epoch in range(10):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Sentiment Training Epoch {epoch+1}", ncols=80):
            x, y = x.to(device), y.to(device)  # labels are already collated into a tensor
            optimizer.zero_grad()
            out = model(x)
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

    # 10. Predict on the test set and save the results
    model.eval()
    all_preds = []
    with torch.no_grad():
        for x in tqdm(test_loader, desc="Predicting Test Set", ncols=80):
            x = x.to(device)
            out = model(x)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.cpu().numpy())

    submission = pd.DataFrame({
        'PhraseId': test_df['PhraseId'],
        'Sentiment': all_preds
    })
    submission.to_csv("sampleSubmission.csv", index=False)
    print("Prediction finished; results saved to sampleSubmission.csv")

The training converged:

Epoch 7, Loss: 1.2165
Sentiment Training Epoch 8: 100%|██████████| 2195/2195 [00:03<00:00, 608.70it/s]
Epoch 8, Loss: 1.2164
Sentiment Training Epoch 9: 100%|██████████| 2195/2195 [00:03<00:00, 594.64it/s]
Epoch 9, Loss: 1.2163
Sentiment Training Epoch 10: 100%|█████████| 2195/2195 [00:04<00:00, 543.85it/s]
Epoch 10, Loss: 1.2163
Predicting Test Set: 100%|████████████████| 1036/1036 [00:00<00:00, 1111.47it/s]
Prediction finished; results saved to sampleSubmission.csv

Submitted:

(screenshot of the submission score)

The score is a bit disappointing; let's try replacing the linear classifier with a non-linear one:

class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

This line needs to change accordingly:

model = SentimentClassifier(embedding_dim, 128, 5).to(device)

Also, Word2Vec does not look fully converged, so double its number of epochs:

for epoch in range(10):

It converged; an excerpt of the output:

Word2Vec Epoch 8: 100%|████████████████████| 9749/9749 [00:53<00:00, 182.00it/s]
Epoch 8, Loss: 5.2779
Word2Vec Epoch 9: 100%|████████████████████| 9749/9749 [00:57<00:00, 168.65it/s]
Epoch 9, Loss: 5.2461
Word2Vec Epoch 10: 100%|███████████████████| 9749/9749 [00:57<00:00, 170.31it/s]
Epoch 10, Loss: 5.2214

And for the sentiment classifier:

Sentiment Training Epoch 7: 100%|██████████| 2195/2195 [00:02<00:00, 966.91it/s]
Epoch 7, Loss: 1.0158
Sentiment Training Epoch 8: 100%|██████████| 2195/2195 [00:02<00:00, 967.65it/s]
Epoch 8, Loss: 1.0073
Sentiment Training Epoch 9: 100%|█████████| 2195/2195 [00:01<00:00, 1115.54it/s]
Epoch 9, Loss: 1.0002
Sentiment Training Epoch 10: 100%|█████████| 2195/2195 [00:02<00:00, 993.63it/s]
Epoch 10, Loss: 0.9938

The final result:

(2.png: screenshot of the new submission score)

The score improved somewhat.