First Word2Vec commit

This commit is contained in:
AnielaWalczak 2024-05-19 19:15:33 +02:00
commit 73ca11f9d1
8 changed files with 125532 additions and 0 deletions

150
Word2Vec2.ipynb Normal file
View File

@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 676us/step\n",
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 541us/step\n",
"Accuracy: 0.9394717534849596\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.92 0.92 0.92 1983\n",
" 1 0.95 0.95 0.95 3469\n",
"\n",
" accuracy 0.94 5452\n",
" macro avg 0.93 0.93 0.93 5452\n",
"weighted avg 0.94 0.94 0.94 5452\n",
"\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from gensim.models import Word2Vec\n",
"from gensim.utils import simple_preprocess\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"import tensorflow as tf\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense\n",
"\n",
"# Build the tokenized corpus used to train word2vec\n",
"def prepare_corpus(filepaths):\n",
"    \"\"\"Tokenize every line of every file in ``filepaths``.\n",
"\n",
"    Each file is opened as UTF-8 text and each line is passed through\n",
"    ``simple_preprocess``. Returns one token list per input line, in file order.\n",
"    \"\"\"\n",
"    sentences = []\n",
"    for path in filepaths:\n",
"        with open(path, 'r', encoding=\"utf8\") as handle:\n",
"            # The generator is consumed by extend() while the file is still open\n",
"            sentences.extend(simple_preprocess(row) for row in handle)\n",
"    return sentences\n",
"\n",
"# Turn a text into a single vector using word2vec\n",
"def vectorize_text(text, model):\n",
"    \"\"\"Average the word2vec embeddings of the tokens of ``text``.\n",
"\n",
"    Tokens missing from ``model.wv`` are skipped. When no token is known,\n",
"    a zero vector of length ``model.vector_size`` is returned instead.\n",
"    \"\"\"\n",
"    keyed_vectors = model.wv\n",
"    known = [keyed_vectors[token] for token in simple_preprocess(text) if token in keyed_vectors]\n",
"    if not known:\n",
"        return np.zeros(model.vector_size)\n",
"    return np.mean(known, axis=0)\n",
"\n",
"# Load raw text data, one document per line\n",
"def load_data(filepath):\n",
"    \"\"\"Read ``filepath`` as UTF-8 and return its lines as a list of strings.\n",
"\n",
"    Leading and trailing whitespace (including the newline) is stripped from\n",
"    every line; blank lines are kept as empty strings.\n",
"    \"\"\"\n",
"    with open(filepath, 'r', encoding=\"utf8\") as handle:\n",
"        return [row.strip() for row in handle]\n",
"\n",
"# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling repeat across runs\n",
"# (word2vec training itself stays nondeterministic while workers > 1)\n",
"tf.keras.utils.set_random_seed(42)\n",
"\n",
"# Build the corpus and train the word2vec model\n",
"corpus = prepare_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])\n",
"w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)\n",
"w2v_model.save(\"word2vec.model\")\n",
"\n",
"# Load the raw texts\n",
"dev_texts = load_data('dev-0/in.tsv')\n",
"test_texts = load_data('test-A/in.tsv')\n",
"\n",
"# Convert the texts to vectors\n",
"dev_vectors = np.array([vectorize_text(text, w2v_model) for text in dev_texts])\n",
"test_vectors = np.array([vectorize_text(text, w2v_model) for text in test_texts])\n",
"\n",
"# Load the labels for the dev data\n",
"dev_labels_df = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
"dev_labels = dev_labels_df[0].values\n",
"\n",
"# Split the dev data into training and validation sets\n",
"X_train, X_val, y_train, y_val = train_test_split(dev_vectors, dev_labels, test_size=0.2, random_state=42)\n",
"\n",
"# Build the neural network\n",
"model_nn = Sequential([\n",
"    Dense(64, activation='relu'),\n",
"    Dense(32, activation='relu'),\n",
"    Dense(1, activation='sigmoid')\n",
"])\n",
"\n",
"model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
"\n",
"# Train with validation. 1000 is only an upper bound on epochs: training stops once\n",
"# val_loss has not improved for 10 epochs and the best weights are restored.\n",
"early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)\n",
"history = model_nn.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0)\n",
"\n",
"# Predictions for the dev and test sets\n",
"dev_predictions = model_nn.predict(dev_vectors)\n",
"test_predictions = model_nn.predict(test_vectors)\n",
"\n",
"# Convert the predicted probabilities to binary classes (0 or 1)\n",
"dev_predictions = (dev_predictions > 0.5).astype(int)\n",
"test_predictions = (test_predictions > 0.5).astype(int)\n",
"\n",
"# Write predictions to a file, one label per line\n",
"def save_predictions(predictions, filepath):\n",
"    \"\"\"Write one predicted label per line to ``filepath`` (UTF-8, overwriting).\n",
"\n",
"    ``predictions`` may be a 2-D column of shape (n, 1), as returned by the\n",
"    thresholded Keras output, in which case the first element of each row is\n",
"    written. A flat sequence of scalar labels is also accepted.\n",
"    \"\"\"\n",
"    with open(filepath, 'w', encoding=\"utf8\") as file:\n",
"        for pred in predictions:\n",
"            # Rows contribute their first element; scalar items are written as-is\n",
"            label = pred[0] if np.ndim(pred) > 0 else pred\n",
"            file.write(f\"{label}\\n\")\n",
"\n",
"save_predictions(dev_predictions, 'dev-0/out.tsv')\n",
"save_predictions(test_predictions, 'test-A/out.tsv')\n",
"\n",
"# Compare the saved dev predictions with the \"expected\" file\n",
"dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()\n",
"expected_labels = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None).values.flatten()\n",
"\n",
"# Accuracy and classification report over the full dev set.\n",
"# NOTE: 80% of dev-0 was used to fit the classifier, so these numbers are\n",
"# dominated by training rows and overestimate generalization.\n",
"accuracy = accuracy_score(expected_labels, dev_pred_labels)\n",
"report = classification_report(expected_labels, dev_pred_labels)\n",
"\n",
"print(f'Accuracy: {accuracy}')\n",
"print('Classification Report:')\n",
"print(report)\n",
"\n",
"# Held-out estimate: score only the validation split the classifier was not fitted on\n",
"val_pred_labels = (model_nn.predict(X_val, verbose=0) > 0.5).astype(int).flatten()\n",
"print(f'Validation accuracy (held-out 20%): {accuracy_score(y_val, val_pred_labels)}')\n",
"print('Validation classification report:')\n",
"print(classification_report(y_val, val_pred_labels))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
word2vec.model Normal file

Binary file not shown.