Uczenie_glebokie/word2vector

# -*- coding: utf-8 -*-
"""word2vec.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o
"""
!pip install numpy pandas gensim tensorflow
!wget https://gonito.net/get/bin/geval
!chmod u+x geval
!git clone https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz
!gzip -d cc.pl.300.vec.gz
import gzip
import shutil
with gzip.open('sport-text-classification-ball-ISI-public/train/train.tsv.gz', 'rb') as f_in:
    with open('sport-text-classification-ball-ISI-public/train/train.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
# Data loading helpers
def load_data(filepath):
    texts = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if len(row) >= 1:
                texts.append(' '.join(row))
    return texts
def load_labeled_data(filepath):
    texts = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if len(row) == 2:
                label, text = row
                if label.isdigit():
                    labels.append(int(label))
                    texts.append(text)
                else:
                    print(f"Ignoring line due to invalid label: {row}")
            else:
                print(f"Ignoring line due to incorrect number of fields: {row}")
    return texts, labels
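# Minimal sanity check of the loaders (a sketch; the sample rows below are
# hypothetical and not taken from the challenge data). train.tsv is expected
# to hold "label<TAB>text" pairs with a binary 0/1 label.
_sample_path = '/tmp/sample_train.tsv'
with open(_sample_path, 'w', encoding='utf-8') as f:
    f.write('1\tLewandowski strzelił dwa gole w meczu ligowym\n')
    f.write('0\tNowa płyta zespołu ukaże się w maju\n')
_texts, _labels = load_labeled_data(_sample_path)
print(_texts, _labels)  # expected: two texts and labels [1, 0]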
print("Starting to load labeled data")
train_texts, train_labels = load_labeled_data('sport-text-classification-ball-ISI-public/train/train.tsv')
print("Loaded training data")
dev_texts = load_data('sport-text-classification-ball-ISI-public/dev-0/in.tsv')
print("Loaded dev data")
test_texts = load_data('sport-text-classification-ball-ISI-public/test-A/in.tsv')
print("Loaded test data")
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
dev_df = pd.DataFrame({'text': dev_texts})
test_df = pd.DataFrame({'text': test_texts})
print("Starting to load Word2Vec model")
# Load the pretrained fastText vectors (word2vec text format)
word2vec = KeyedVectors.load_word2vec_format('cc.pl.300.vec', binary=False)
print("Loaded Word2Vec model")
# Prepare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
print("Fitted tokenizer")
# Convert texts to integer sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
# Pad all sequences to the longest sequence across train/dev/test
max_length = max(
    max(len(seq) for seq in train_sequences),
    max(len(seq) for seq in dev_sequences),
    max(len(seq) for seq in test_sequences),
)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')
print("Padded sequences")
# Build the embedding matrix from the pretrained vectors
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]
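# Optional coverage report (a sketch, not part of the original notebook): words
# absent from cc.pl.300.vec keep an all-zero row in embedding_matrix.
covered = sum(1 for word in tokenizer.word_index if word in word2vec)
print(f"Embedding coverage: {covered}/{len(tokenizer.word_index)} "
      f"words ({covered / len(tokenizer.word_index):.1%}) found in the pretrained vectors")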
print("Prepared embedding matrix")
# Model architecture: frozen pretrained embeddings + a small dense classifier
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=word2vec.vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("Compiled model")
# Training
model.fit(train_padded, np.array(train_labels), epochs=10, batch_size=32, validation_split=0.2)
print("Trained model")
# Prediction
dev_predictions = model.predict(dev_padded)
test_predictions = model.predict(test_padded)
# Threshold the sigmoid outputs at 0.5 to get binary labels
dev_predictions = (dev_predictions > 0.5).astype(int)
test_predictions = (test_predictions > 0.5).astype(int)
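# Optional local check (a sketch, assuming dev-0/expected.tsv ships with the
# public repository; the geval call below computes the same metric):
import os
expected_path = 'sport-text-classification-ball-ISI-public/dev-0/expected.tsv'
if os.path.exists(expected_path):
    expected = np.loadtxt(expected_path, dtype=int)
    print("Dev accuracy:", (dev_predictions.ravel() == expected).mean())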
# Save predictions; geval expects them inside the challenge directory
# as dev-0/out.tsv and test-A/out.tsv
pd.DataFrame(dev_predictions).to_csv('dev-0_out.tsv', sep='\t', header=False, index=False)
pd.DataFrame(test_predictions).to_csv('test-A_out.tsv', sep='\t', header=False, index=False)
shutil.copyfile('dev-0_out.tsv', 'sport-text-classification-ball-ISI-public/dev-0/out.tsv')
shutil.copyfile('test-A_out.tsv', 'sport-text-classification-ball-ISI-public/test-A/out.tsv')
print("Saved predictions")
# Commented out IPython magic to ensure Python compatibility.
# %cd sport-text-classification-ball-ISI-public
!../geval -t dev-0 --metric Accuracy
!../geval -t test-A --metric Accuracy
import math
accuracy = 0.91471
points = math.ceil(accuracy * 7.0)
print(f"Points: {points}")