import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download("punkt", quiet=True)  # fetch the tokenizer data word_tokenize needs (no-op if already present)
class Word2VecDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]
class Word2Vec(nn.Module):
    # Skip-gram model with a full softmax over the vocabulary:
    # in_embed holds center-word vectors, out_embed holds context-word vectors.
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_idx):
        center_vec = self.in_embed(center_idx)                      # (batch, dim)
        scores = torch.matmul(center_vec, self.out_embed.weight.T)  # (batch, vocab)
        log_probs = nn.functional.log_softmax(scores, dim=1)
        return log_probs
class SentimentDataset(Dataset):
    # Represents each phrase as the average of its word vectors
    # (a zero vector if none of its tokens are in the vocabulary).
    def __init__(self, phrases, labels, word_to_vec, dim):
        self.vectors = []
        self.labels = labels.values if labels is not None else None
        for phrase in tqdm(phrases, desc="Processing phrases for Dataset", ncols=80):
            tokens = word_tokenize(phrase.lower())
            vectors = [word_to_vec[word] for word in tokens if word in word_to_vec]
            if vectors:
                avg_vec = np.mean(vectors, axis=0)
            else:
                avg_vec = np.zeros(dim)
            self.vectors.append(torch.tensor(avg_vec, dtype=torch.float32))

    def __len__(self):
        return len(self.vectors)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.vectors[idx], self.labels[idx]
        else:
            return self.vectors[idx]
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)
if __name__ == '__main__':
    train_df = pd.read_csv("train.tsv", sep="\t")
    test_df = pd.read_csv("test.tsv", sep="\t")
    all_phrases = pd.concat([train_df['Phrase'], test_df['Phrase']], ignore_index=True)
    tokenized_phrases = [word_tokenize(p.lower()) for p in all_phrases]
    tokens = [word for phrase in tokenized_phrases for word in phrase]
    vocab = sorted(set(tokens))
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    vocab_size = len(vocab)
    # Build skip-gram (center, context) index pairs within a +/- window_size window.
    window_size = 2
    training_pairs = []
    for phrase in tqdm(tokenized_phrases, desc="Building training pairs", ncols=80):
        for i, center_word in enumerate(phrase):
            if center_word not in word_to_idx:
                continue
            center_idx = word_to_idx[center_word]
            for j in range(-window_size, window_size + 1):
                if j == 0 or i + j < 0 or i + j >= len(phrase):
                    continue
                context_word = phrase[i + j]
                if context_word in word_to_idx:
                    context_idx = word_to_idx[context_word]
                    training_pairs.append((center_idx, context_idx))
    embedding_dim = 100
    w2v_model = Word2Vec(vocab_size, embedding_dim)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"using {device}")
    w2v_model.to(device)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(w2v_model.parameters(), lr=0.001)
    batch_size = 512
    train_dataset = Word2VecDataset(training_pairs)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    for epoch in range(5):
        w2v_model.train()
        total_loss = 0
        for center_batch, context_batch in tqdm(train_loader, desc=f"Word2Vec Epoch {epoch+1}", ncols=80):
            center_batch = center_batch.to(device)
            context_batch = context_batch.to(device)
            optimizer.zero_grad()
            log_probs = w2v_model(center_batch)
            loss = criterion(log_probs, context_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * center_batch.size(0)
        avg_loss = total_loss / len(train_dataset)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
    word_vectors = w2v_model.in_embed.weight.data.cpu().numpy()
    word_to_vec = {word: word_vectors[word_to_idx[word]] for word in vocab}
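    # Optional sanity check (not part of the original script): print the nearest
    # neighbours of a probe word by cosine similarity over the learned embeddings,
    # just to verify that training produced something sensible. The probe word
    # "good" is only an illustrative choice.
    def nearest_neighbors(query, top_k=5):
        if query not in word_to_vec:
            return []
        q = word_to_vec[query]
        sims = word_vectors @ q / (np.linalg.norm(word_vectors, axis=1) * np.linalg.norm(q) + 1e-8)
        best = np.argsort(-sims)
        return [vocab[i] for i in best[1:top_k + 1]]  # skip the query word itself

    print("neighbours of 'good':", nearest_neighbors("good"))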
    train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)
    train_dataset = SentimentDataset(train_data['Phrase'], train_data['Sentiment'], word_to_vec, embedding_dim)
    val_dataset = SentimentDataset(val_data['Phrase'], val_data['Sentiment'], word_to_vec, embedding_dim)
    test_dataset = SentimentDataset(test_df['Phrase'], None, word_to_vec, embedding_dim)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)
    model = SentimentClassifier(embedding_dim, 5).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(10):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Sentiment Training Epoch {epoch+1}", ncols=80):
            # y already arrives as a LongTensor from the default collate,
            # so it only needs to be moved to the device (re-wrapping it with
            # torch.tensor would trigger a copy warning).
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")
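    # The validation loader built above is otherwise unused; a minimal sketch
    # (added here, not in the original script) of using it to report held-out
    # accuracy after training.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            preds = torch.argmax(model(x), dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    print(f"Validation accuracy: {correct / total:.4f}")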
    model.eval()
    all_preds = []
    with torch.no_grad():
        for x in tqdm(test_loader, desc="Predicting Test Set", ncols=80):
            x = x.to(device)
            out = model(x)
            preds = torch.argmax(out, dim=1)
            all_preds.extend(preds.cpu().numpy())
    submission = pd.DataFrame({
        'PhraseId': test_df['PhraseId'],
        'Sentiment': all_preds
    })
    submission.to_csv("sampleSubmission.csv", index=False)
    print("Prediction finished; results saved to sampleSubmission.csv")