
Fudan NLP-Beginner Task 1

Task: Task 1 of FudanNLP/nlp-beginner.

Task 1: Text Classification Based on Machine Learning

Implement text classification based on logistic/softmax regression.

  1. References
    1. Text classification
    2. 《神经网络与深度学习》 (Neural Networks and Deep Learning), Chapters 2 and 3
  2. Dataset: Classify the sentiment of sentences from the Rotten Tomatoes dataset
  3. Implementation requirement: NumPy
  4. Knowledge points to understand:
    1. Text feature representation: Bag-of-Words, N-gram
    2. Classifiers: logistic/softmax regression; loss functions, (stochastic) gradient descent, feature selection (a minimal NumPy sketch of this baseline appears right after this list)
  5. Experiments:
    1. Analyze how different features, loss functions, and learning rates affect the final classification performance
    2. shuffle, batch, mini-batch
  6. Time: two weeks
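
For reference only, the baseline that items 4.1 and 4.2 describe, Bag-of-Words features plus softmax regression trained by gradient descent in pure NumPy, could be sketched roughly as below. The toy corpus and variable names are made up for illustration; it is not the approach used in the rest of this post.

import numpy as np

# Toy corpus for illustration; in the real task these would come from train.tsv.
texts = ["a touching movie", "a boring movie", "touching and wonderful"]
labels = np.array([3, 1, 4])
num_classes = 5

# 1. Bag-of-Words features: one count per vocabulary word.
vocab = sorted({w for t in texts for w in t.split()})
word_to_idx = {w: i for i, w in enumerate(vocab)}
X = np.zeros((len(texts), len(vocab)))
for i, t in enumerate(texts):
    for w in t.split():
        X[i, word_to_idx[w]] += 1

# 2. Softmax regression trained with full-batch gradient descent.
W = np.zeros((len(vocab), num_classes))
b = np.zeros(num_classes)
Y = np.eye(num_classes)[labels]                      # one-hot labels
lr = 0.1
for _ in range(200):
    scores = X @ W + b
    scores -= scores.max(axis=1, keepdims=True)      # for numerical stability
    probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    grad = (probs - Y) / len(texts)                  # gradient of mean cross-entropy w.r.t. scores
    W -= lr * (X.T @ grad)
    b -= lr * grad.sum(axis=0)

print(probs.argmax(axis=1))                          # predicted classes on the toy corpus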

Those are the task requirements. However, since Bag-of-Words and N-gram features are fairly traditional and may not handle the classification task well, I decided to try Word2Vec instead and submit the results on "Classify the sentiment of sentences from the Rotten Tomatoes dataset".

In a nutshell, Word2Vec (with the Skip-gram model) trains two matrices: an input-vector matrix for center words (in_embed) and an output-vector matrix for context words (out_embed). The two matrices are optimized jointly by maximizing the similarity between each center word and its context words (dot products converted to probabilities via softmax), so that semantically related words end up closer together in the vector space.
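
In formulas (the standard Skip-gram objective with a full softmax, which is what the code below implements; $v_c$ is the in_embed row of the center word $c$, $u_o$ the out_embed row of a context word $o$, and $V$ the vocabulary):

$$P(o \mid c) = \frac{\exp(u_o^{\top} v_c)}{\sum_{w \in V} \exp(u_w^{\top} v_c)}, \qquad \mathcal{L} = -\sum_{(c,\,o)} \log P(o \mid c)$$

Minimizing $\mathcal{L}$ over all observed (center, context) pairs pushes up the score $u_o^{\top} v_c$ of each observed pair relative to every other word in the vocabulary.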

Once Word2Vec is trained, the word embeddings carry semantic information. The mean of a phrase's word vectors is taken as the sentence vector, and a classifier is then learned on top of it.
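
The sentence vector is literally just the mean of the token embeddings. A minimal sketch of that step, assuming a dict word_to_vec mapping tokens to NumPy vectors (the full script below builds exactly such a dict, and its SentimentDataset class performs this same averaging):

import numpy as np
from nltk.tokenize import word_tokenize

def phrase_to_vector(phrase, word_to_vec, dim):
    # Average the vectors of known tokens; phrases with no known token fall back to a zero vector.
    tokens = word_tokenize(phrase.lower())
    vecs = [word_to_vec[t] for t in tokens if t in word_to_vec]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)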

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm


class Word2VecDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)  # context-word vectors

    def forward(self, center_idx):
        # Score every vocabulary word as a possible context of each center word (full softmax).
        center_vec = self.in_embed(center_idx)
        scores = torch.matmul(center_vec, self.out_embed.weight.T)
        log_probs = nn.functional.log_softmax(scores, dim=1)
        return log_probs

class SentimentDataset(Dataset):
    # Represents each phrase as the mean of its tokens' Word2Vec vectors.
    def __init__(self, phrases, labels, word_to_vec, dim):
        self.vectors = []
        self.labels = labels.values if labels is not None else None
        for phrase in tqdm(phrases, desc="Processing phrases for Dataset", ncols=80):
            tokens = word_tokenize(phrase.lower())
            vectors = [word_to_vec[word] for word in tokens if word in word_to_vec]
            if vectors:
                avg_vec = np.mean(vectors, axis=0)
            else:
                avg_vec = np.zeros(dim)
            self.vectors.append(torch.tensor(avg_vec, dtype=torch.float32))

    def __len__(self):
        return len(self.vectors)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.vectors[idx], self.labels[idx]
        else:
            return self.vectors[idx]

class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

if __name__ == '__main__':

    # nltk.download('punkt')
    # 1. Load the data
    train_df = pd.read_csv("train.tsv", sep="\t")
    test_df = pd.read_csv("test.tsv", sep="\t")

    # 2. Build the vocabulary
    all_phrases = pd.concat([train_df['Phrase'], test_df['Phrase']], ignore_index=True)
    tokenized_phrases = [word_tokenize(p.lower()) for p in all_phrases]

    tokens = [word for phrase in tokenized_phrases for word in phrase]
    vocab = sorted(set(tokens))
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    vocab_size = len(vocab)

    # 3. Build the Word2Vec training pairs
    window_size = 2
    training_pairs = []
    for phrase in tqdm(tokenized_phrases, desc="Building training pairs", ncols=80):
        for i, center_word in enumerate(phrase):
            if center_word not in word_to_idx:
                continue
            center_idx = word_to_idx[center_word]
            for j in range(-window_size, window_size + 1):
                if j == 0 or i + j < 0 or i + j >= len(phrase):
                    continue
                context_word = phrase[i + j]
                if context_word in word_to_idx:
                    context_idx = word_to_idx[context_word]
                    training_pairs.append((center_idx, context_idx))

    # 4. Initialize the model, loss, optimizer, DataLoader, etc.
    embedding_dim = 100
    w2v_model = Word2Vec(vocab_size, embedding_dim)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"using {device}")
    w2v_model.to(device)

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(w2v_model.parameters(), lr=0.001)

    batch_size = 512
    train_dataset = Word2VecDataset(training_pairs)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    # 5. Train Word2Vec
    for epoch in range(5):
        w2v_model.train()
        total_loss = 0
        for center_batch, context_batch in tqdm(train_loader, desc=f"Word2Vec Epoch {epoch+1}", ncols=80):
            center_batch = center_batch.to(device)
            context_batch = context_batch.to(device)
            optimizer.zero_grad()
            log_probs = w2v_model(center_batch)
            loss = criterion(log_probs, context_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * center_batch.size(0)
        avg_loss = total_loss / len(train_dataset)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # 6. Extract the word vectors
    word_vectors = w2v_model.in_embed.weight.data.cpu().numpy()
    word_to_vec = {word: word_vectors[word_to_idx[word]] for word in vocab}

    # 7. Build the sentiment datasets
    train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)
    train_dataset = SentimentDataset(train_data['Phrase'], train_data['Sentiment'], word_to_vec, embedding_dim)
    val_dataset = SentimentDataset(val_data['Phrase'], val_data['Sentiment'], word_to_vec, embedding_dim)
    test_dataset = SentimentDataset(test_df['Phrase'], None, word_to_vec, embedding_dim)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # 8. Define the sentiment classification model
    model = SentimentClassifier(embedding_dim, 5).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 9. Train the sentiment classifier
    for epoch in range(10):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Sentiment Training Epoch {epoch+1}", ncols=80):
            x, y = x.to(device), y.to(device)  # labels are already collated into a tensor
            optimizer.zero_grad()
            out = model(x)
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

    # 10. Predict on the test set and save the results
    model.eval()
    all_preds = []
    with torch.no_grad():
        for x in tqdm(test_loader, desc="Predicting Test Set", ncols=80):
            x = x.to(device)
            out = model(x)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.cpu().numpy())

    submission = pd.DataFrame({
        'PhraseId': test_df['PhraseId'],
        'Sentiment': all_preds
    })
    submission.to_csv("sampleSubmission.csv", index=False)
    print("Prediction finished; results saved to sampleSubmission.csv")

The training converged:

Epoch 7, Loss: 1.2165
Sentiment Training Epoch 8: 100%|██████████| 2195/2195 [00:03<00:00, 608.70it/s]
Epoch 8, Loss: 1.2164
Sentiment Training Epoch 9: 100%|██████████| 2195/2195 [00:03<00:00, 594.64it/s]
Epoch 9, Loss: 1.2163
Sentiment Training Epoch 10: 100%|█████████| 2195/2195 [00:04<00:00, 543.85it/s]
Epoch 10, Loss: 1.2163
Predicting Test Set: 100%|████████████████| 1036/1036 [00:00<00:00, 1111.47it/s]
Prediction finished; results saved to sampleSubmission.csv

Submitted:

(screenshot of the submission score)

The score is a bit disappointing; let's try replacing the linear classifier with a non-linear one:

class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

This line needs to change accordingly:

model = SentimentClassifier(embedding_dim, 128, 5).to(device)

Also, Word2Vec does not look fully converged, so double its number of epochs:

for epoch in range(10):

It converged; an excerpt of the output:

Word2Vec Epoch 8: 100%|████████████████████| 9749/9749 [00:53<00:00, 182.00it/s]
Epoch 8, Loss: 5.2779
Word2Vec Epoch 9: 100%|████████████████████| 9749/9749 [00:57<00:00, 168.65it/s]
Epoch 9, Loss: 5.2461
Word2Vec Epoch 10: 100%|███████████████████| 9749/9749 [00:57<00:00, 170.31it/s]
Epoch 10, Loss: 5.2214

And for the sentiment classifier:

Sentiment Training Epoch 7: 100%|██████████| 2195/2195 [00:02<00:00, 966.91it/s]
Epoch 7, Loss: 1.0158
Sentiment Training Epoch 8: 100%|██████████| 2195/2195 [00:02<00:00, 967.65it/s]
Epoch 8, Loss: 1.0073
Sentiment Training Epoch 9: 100%|█████████| 2195/2195 [00:01<00:00, 1115.54it/s]
Epoch 9, Loss: 1.0002
Sentiment Training Epoch 10: 100%|█████████| 2195/2195 [00:02<00:00, 993.63it/s]
Epoch 10, Loss: 0.9938

The final result:

(2.png: screenshot of the new submission score)

The score improved somewhat.