First Word2Vec commit
This commit is contained in:
commit
73ca11f9d1
150
Word2Vec2.ipynb
Normal file
150
Word2Vec2.ipynb
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 676us/step\n",
|
||||||
|
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 541us/step\n",
|
||||||
|
"Accuracy: 0.9394717534849596\n",
|
||||||
|
"Classification Report:\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.92 0.92 0.92 1983\n",
|
||||||
|
" 1 0.95 0.95 0.95 3469\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.94 5452\n",
|
||||||
|
" macro avg 0.93 0.93 0.93 5452\n",
|
||||||
|
"weighted avg 0.94 0.94 0.94 5452\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from gensim.models import Word2Vec\n",
|
||||||
|
"from gensim.utils import simple_preprocess\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn.metrics import accuracy_score, classification_report\n",
|
||||||
|
"import tensorflow as tf\n",
|
||||||
|
"from tensorflow.keras.models import Sequential\n",
|
||||||
|
"from tensorflow.keras.layers import Dense\n",
|
||||||
|
"\n",
|
||||||
|
"# Funkcja do przygotowania korpusu do trenowania word2vec\n",
|
||||||
|
"def prepare_corpus(filepaths):\n",
|
||||||
|
" corpus = []\n",
|
||||||
|
" for filepath in filepaths:\n",
|
||||||
|
" with open(filepath, 'r', encoding=\"utf8\") as file:\n",
|
||||||
|
" for line in file:\n",
|
||||||
|
" tokens = simple_preprocess(line)\n",
|
||||||
|
" corpus.append(tokens)\n",
|
||||||
|
" return corpus\n",
|
||||||
|
"\n",
|
||||||
|
"# Funkcja do zamiany tekstów na wektory przy użyciu word2vec\n",
|
||||||
|
"def vectorize_text(text, model):\n",
|
||||||
|
" tokens = simple_preprocess(text)\n",
|
||||||
|
" vectors = [model.wv[word] for word in tokens if word in model.wv]\n",
|
||||||
|
" if vectors:\n",
|
||||||
|
" return np.mean(vectors, axis=0)\n",
|
||||||
|
" else:\n",
|
||||||
|
" return np.zeros(model.vector_size)\n",
|
||||||
|
"\n",
|
||||||
|
"# Funkcja do wczytywania danych tekstowych\n",
|
||||||
|
"def load_data(filepath):\n",
|
||||||
|
" texts = []\n",
|
||||||
|
" with open(filepath, 'r', encoding=\"utf8\") as file:\n",
|
||||||
|
" for line in file:\n",
|
||||||
|
" texts.append(line.strip())\n",
|
||||||
|
" return texts\n",
|
||||||
|
"\n",
|
||||||
|
"# Przygotowanie korpusu i trening modelu word2vec\n",
|
||||||
|
"corpus = prepare_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])\n",
|
||||||
|
"w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)\n",
|
||||||
|
"w2v_model.save(\"word2vec.model\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Wczytywanie tekstów\n",
|
||||||
|
"dev_texts = load_data('dev-0/in.tsv')\n",
|
||||||
|
"test_texts = load_data('test-A/in.tsv')\n",
|
||||||
|
"\n",
|
||||||
|
"# Zamiana tekstów na wektory\n",
|
||||||
|
"dev_vectors = np.array([vectorize_text(text, w2v_model) for text in dev_texts])\n",
|
||||||
|
"test_vectors = np.array([vectorize_text(text, w2v_model) for text in test_texts])\n",
|
||||||
|
"\n",
|
||||||
|
"# Wczytywanie etykiet dla danych dev\n",
|
||||||
|
"dev_labels_df = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
|
||||||
|
"dev_labels = dev_labels_df[0].values\n",
|
||||||
|
"\n",
|
||||||
|
"# Podział danych dev na zbiór treningowy i walidacyjny\n",
|
||||||
|
"X_train, X_val, y_train, y_val = train_test_split(dev_vectors, dev_labels, test_size=0.2, random_state=42)\n",
|
||||||
|
"\n",
|
||||||
|
"# Budowa modelu sieci neuronowej\n",
|
||||||
|
"model_nn = Sequential([\n",
|
||||||
|
" Dense(64, activation='relu'),\n",
|
||||||
|
" Dense(32, activation='relu'),\n",
|
||||||
|
" Dense(1, activation='sigmoid')\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
|
||||||
|
"\n",
|
||||||
|
"# Trening modelu z walidacją\n",
|
||||||
|
"history = model_nn.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_val, y_val), verbose=0)\n",
|
||||||
|
"\n",
|
||||||
|
"# Predykcje dla zbioru dev i test\n",
|
||||||
|
"dev_predictions = model_nn.predict(dev_vectors)\n",
|
||||||
|
"test_predictions = model_nn.predict(test_vectors)\n",
|
||||||
|
"\n",
|
||||||
|
"# Konwersja predykcji do binarnych klas (0 lub 1)\n",
|
||||||
|
"dev_predictions = (dev_predictions > 0.5).astype(int)\n",
|
||||||
|
"test_predictions = (test_predictions > 0.5).astype(int)\n",
|
||||||
|
"\n",
|
||||||
|
"# Zapis predykcji do plików\n",
|
||||||
|
"def save_predictions(predictions, filepath):\n",
|
||||||
|
" with open(filepath, 'w', encoding=\"utf8\") as file:\n",
|
||||||
|
" for pred in predictions:\n",
|
||||||
|
" file.write(f\"{pred[0]}\\n\")\n",
|
||||||
|
"\n",
|
||||||
|
"save_predictions(dev_predictions, 'dev-0/out.tsv')\n",
|
||||||
|
"save_predictions(test_predictions, 'test-A/out.tsv')\n",
|
||||||
|
"\n",
|
||||||
|
"# Porównanie wyników z plikiem \"expected\"\n",
|
||||||
|
"dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()\n",
|
||||||
|
"expected_labels = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None).values.flatten()\n",
|
||||||
|
"\n",
|
||||||
|
"# Wyświetlenie dokładności i raportu klasyfikacji\n",
|
||||||
|
"accuracy = accuracy_score(expected_labels, dev_pred_labels)\n",
|
||||||
|
"report = classification_report(expected_labels, dev_pred_labels)\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'Accuracy: {accuracy}')\n",
|
||||||
|
"print('Classification Report:')\n",
|
||||||
|
"print(report)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user