Uczenie_glebokie/word2vector

# -*- coding: utf-8 -*-
"""word2vec.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o
"""

# Notebook shell setup. Lines starting with `!` are IPython shell commands, so
# this file runs as-is only inside a notebook / IPython session.
# Install the Python packages imported further down.
!pip install numpy pandas gensim tensorflow
# Download the geval binary into the working directory and make it executable;
# it is invoked at the end of the script to compute the Accuracy metric.
!wget https://gonito.net/get/bin/geval
!chmod u+x geval

# Clone the challenge repository that holds the train / dev-0 / test-A data.
!git clone https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git

# Download the cc.pl.300 fastText vectors and unpack them to cc.pl.300.vec.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz
!gzip -d cc.pl.300.vec.gz

import gzip
import shutil

# Decompress the gzipped training set next to the archive so that it can be
# read later as a plain-text TSV file.
with gzip.open('sport-text-classification-ball-ISI-public/train/train.tsv.gz', 'rb') as gz_file, \
        open('sport-text-classification-ball-ISI-public/train/train.tsv', 'wb') as tsv_file:
    shutil.copyfileobj(gz_file, tsv_file)

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv

# Data loading
def load_data(filepath):
    """Read an unlabeled TSV file and return one text string per row.

    All tab-separated fields of a row are joined with a single space, so
    each row is reduced to one string. Rows without any field (blank
    lines) are skipped.

    The file is parsed with ``csv.QUOTE_NONE`` because the input is raw
    text, not quoted CSV. With the csv default, a field that starts with a
    double quote is treated as a quoted field and can swallow the lines
    that follow it, so fewer rows come back than the file has lines.

    Args:
        filepath: Path to a UTF-8 encoded, tab-separated text file.

    Returns:
        A list of strings, one per non-empty row, in file order.
    """
    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [' '.join(row) for row in reader if row]

def load_labeled_data(filepath):
    """Read a labeled TSV file with rows of the form ``label<TAB>text``.

    A row is kept only when it has exactly two fields and the label
    consists solely of decimal digits; every other row is reported on
    stdout and skipped.

    The file is parsed with ``csv.QUOTE_NONE`` because the input is raw
    text, not quoted CSV. With the csv default, a text field that starts
    with a double quote is treated as a quoted field and can swallow the
    following lines, silently merging several samples into one.

    Args:
        filepath: Path to a UTF-8 encoded, tab-separated text file.

    Returns:
        A tuple ``(texts, labels)`` of two equally long lists: the text of
        every accepted row and its label converted to ``int``.
    """
    texts = []
    labels = []
    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) != 2:
                print(f"Ignoring line due to incorrect number of fields: {row}")
                continue
            label, text = row
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits, which int() rejects.
            if not label.isdecimal():
                print(f"Ignoring line due to invalid label: {row}")
                continue
            labels.append(int(label))
            texts.append(text)
    return texts, labels
print("Starting to load labeled data")


# Load the three splits: the training file has "label<TAB>text" rows, while the
# dev-0 and test-A inputs are read as text only.
train_texts, train_labels = load_labeled_data('sport-text-classification-ball-ISI-public/train/train.tsv')
print("Loaded training data")
dev_texts = load_data('sport-text-classification-ball-ISI-public/dev-0/in.tsv')
print("Loaded dev data")
test_texts = load_data('sport-text-classification-ball-ISI-public/test-A/in.tsv')
print("Loaded test data")

# Wrap every split in a DataFrame.
# NOTE(review): in the visible script only dev_df['text'] and test_df['text']
# are read afterwards; train_df is not read again.
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
dev_df = pd.DataFrame({'text': dev_texts})
test_df = pd.DataFrame({'text': test_texts})
print("Starting to load Word2Vec model")

# Load the pre-trained word vectors from the text-format .vec file
# (binary=False). The file is the fastText download unpacked during setup; it
# is only read through the word2vec text format, hence the variable name.
word2vec = KeyedVectors.load_word2vec_format('cc.pl.300.vec', binary=False)
print("Loaded Word2Vec model")

# Tokenizer preparation: the vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
print("Fitted tokenizer")

# Convert the texts of every split into sequences of integer word indices.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts, dev_df['text'], test_df['text'])
)

# Pad all splits (zeros appended at the end) to the length of the longest
# sequence found in any of the three splits.
max_length = max(
    max(map(len, sequences))
    for sequences in (train_sequences, dev_sequences, test_sequences)
)
train_padded, dev_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, dev_sequences, test_sequences)
)
print("Padded sequences")

# Embedding matrix preparation: row i holds the pre-trained vector of the word
# the tokenizer maps to index i. The matrix has one row more than there are
# words; rows that never receive a vector (including row 0) stay all-zero.
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))

for word, i in tokenizer.word_index.items():
    if word not in word2vec:
        # No pre-trained vector for this word: keep its zero row.
        continue
    embedding_matrix[i] = word2vec[word]
print("Prepared embedding matrix")

# Model architecture: frozen pre-trained embeddings -> flatten -> small dense
# hidden layer -> single sigmoid unit for the binary label.
# Local imports keep this cell self-contained; both names come from the
# tensorflow package the script already imports from.
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Input

model = Sequential()
# Fix the sequence length with an explicit Input layer instead of Embedding's
# `input_length` argument, which is deprecated in Keras 3. Flatten and Dense
# still get a fully known input shape.
model.add(Input(shape=(max_length,), dtype='int32'))
# Seed the embedding table through the initializer rather than the legacy
# `weights=` constructor kwarg, which not every Keras 3 release accepts.
# trainable=False keeps the pre-trained vectors frozen during training.
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=word2vec.vector_size,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("Compiled model")

# Training: 10 epochs, batches of 32, with 20% of the training data held out
# by Keras for validation.
model.fit(
    train_padded,
    np.array(train_labels),
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
print("Trained model")

# Prediction: turn the sigmoid outputs into binary 0/1 labels by thresholding
# at 0.5.
dev_predictions = (model.predict(dev_padded) > 0.5).astype(int)
test_predictions = (model.predict(test_padded) > 0.5).astype(int)

# Save the results as header-less, index-less TSV files in the current
# working directory.
# NOTE(review): the geval calls at the end of the script select the splits
# with `-t dev-0` / `-t test-A`; confirm geval picks up these file names.
pd.DataFrame(dev_predictions).to_csv(
    'dev-0_out.tsv', sep='\t', header=False, index=False)
pd.DataFrame(test_predictions).to_csv(
    'test-A_out.tsv', sep='\t', header=False, index=False)
print("Saved predictions")

# Commented out IPython magic to ensure Python compatibility.
# %cd sport-text-classification-ball-ISI-public
# Score the dev-0 and test-A splits with geval using the Accuracy metric.
# NOTE(review): geval was downloaded into the starting directory but is invoked
# here as ../geval, i.e. relative to the challenge directory that the
# commented-out %cd above would have entered — confirm the working directory
# before running these commands.
!../geval -t dev-0 --metric Accuracy
!../geval -t test-A --metric Accuracy

import math

# Convert the accuracy into points: accuracy scaled by 7 and rounded up.
# NOTE(review): the accuracy value is hard-coded — presumably copied from the
# geval output; confirm it matches the latest evaluation.
accuracy: float = 0.91471
points: int = math.ceil(7.0 * accuracy)
print(f"Points: {points}")
Add word2vector 2024-05-26 23:06:44 +02:00			`# -- coding: utf-8 --`
			`"""word2vec.ipynb`

			`Automatically generated by Colab.`

			`Original file is located at`
			`https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o`
			`"""`

			`!pip install numpy pandas gensim tensorflow`
			`!wget https://gonito.net/get/bin/geval`
			`!chmod u+x geval`

			`!git clone https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git`

			`!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz`
			`!gzip -d cc.pl.300.vec.gz`

			`import gzip`
			`import shutil`

			`with gzip.open('sport-text-classification-ball-ISI-public/train/train.tsv.gz', 'rb') as f_in:`
			`with open('sport-text-classification-ball-ISI-public/train/train.tsv', 'wb') as f_out:`
			`shutil.copyfileobj(f_in, f_out)`

			`import pandas as pd`
			`import numpy as np`
			`from gensim.models import KeyedVectors`
			`from tensorflow.keras.models import Sequential`
			`from tensorflow.keras.layers import Dense, Flatten, Embedding`
			`from tensorflow.keras.preprocessing.text import Tokenizer`
			`from tensorflow.keras.preprocessing.sequence import pad_sequences`
			`import csv`

			`# Wczytanie danychh`
			`def load_data(filepath):`
			`texts = []`
			`with open(filepath, 'r', encoding='utf-8') as file:`
			`reader = csv.reader(file, delimiter='\t')`
			`for row in reader:`
			`if len(row) >= 1:`
			`texts.append(' '.join(row))`
			`return texts`

			`def load_labeled_data(filepath):`
			`texts = []`
			`labels = []`
			`with open(filepath, 'r', encoding='utf-8') as file:`
			`reader = csv.reader(file, delimiter='\t')`
			`for row in reader:`
			`if len(row) == 2:`
			`label, text = row`
			`if label.isdigit():`
			`labels.append(int(label))`
			`texts.append(text)`
			`else:`
			`print(f"Ignoring line due to invalid label: {row}")`
			`else:`
			`print(f"Ignoring line due to incorrect number of fields: {row}")`
			`return texts, labels`
			`print("Starting to load labeled data")`


			`train_texts, train_labels = load_labeled_data('sport-text-classification-ball-ISI-public/train/train.tsv')`
			`print("Loaded training data")`
			`dev_texts = load_data('sport-text-classification-ball-ISI-public/dev-0/in.tsv')`
			`print("Loaded dev data")`
			`test_texts = load_data('sport-text-classification-ball-ISI-public/test-A/in.tsv')`
			`print("Loaded test data")`

			`train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})`
			`dev_df = pd.DataFrame({'text': dev_texts})`
			`test_df = pd.DataFrame({'text': test_texts})`
			`print("Starting to load Word2Vec model")`

			`# Wczytanie word2vec`
			`word2vec = KeyedVectors.load_word2vec_format('cc.pl.300.vec', binary=False)`
			`print("Loaded Word2Vec model")`

			`# Przygotowanie tokenizera`
			`tokenizer = Tokenizer()`
			`tokenizer.fit_on_texts(train_texts)`
			`print("Fitted tokenizer")`

			`# Zamiana tekstu na sekwencje`
			`train_sequences = tokenizer.texts_to_sequences(train_texts)`
			`dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])`
			`test_sequences = tokenizer.texts_to_sequences(test_df['text'])`

			`# Padding sekwencji`
			`max_length = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in dev_sequences), max(len(seq) for seq in test_sequences))`
			`train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')`
			`dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding='post')`
			`test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')`
			`print("Padded sequences")`

			`# Przygotowanie macierzy embedding`
			`vocab_size = len(tokenizer.word_index) + 1`
			`embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))`

			`for word, i in tokenizer.word_index.items():`
			`if word in word2vec:`
			`embedding_matrix[i] = word2vec[word]`
			`print("Prepared embedding matrix")`

			`# Architektura modelu`
			`model = Sequential()`
			`model.add(Embedding(input_dim=vocab_size, output_dim=word2vec.vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False))`
			`model.add(Flatten())`
			`model.add(Dense(10, activation='relu'))`
			`model.add(Dense(1, activation='sigmoid'))`

			`model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])`
			`print("Compiled model")`

			`# Trenowanie`
			`model.fit(train_padded, np.array(train_labels), epochs=10, batch_size=32, validation_split=0.2)`
			`print("Trained model")`

			`# Predykcja`
			`dev_predictions = model.predict(dev_padded)`
			`test_predictions = model.predict(test_padded)`

			`# Zamiana wyników na format binarny`
			`dev_predictions = (dev_predictions > 0.5).astype(int)`
			`test_predictions = (test_predictions > 0.5).astype(int)`

			`# Zapisanie wyników`
			`pd.DataFrame(dev_predictions).to_csv('dev-0_out.tsv', sep='\t', header=False, index=False)`
			`pd.DataFrame(test_predictions).to_csv('test-A_out.tsv', sep='\t', header=False, index=False)`
			`print("Saved predictions")`

			`# Commented out IPython magic to ensure Python compatibility.`
			`# %cd sport-text-classification-ball-ISI-public`
			`!../geval -t dev-0 --metric Accuracy`
			`!../geval -t test-A --metric Accuracy`

			`import math`

			`accuracy = 0.91471`
			`points = math.ceil(accuracy * 7.0)`
			`print(f"Points: {points}")`