import os import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.utils import class_weight from gensim.models import KeyedVectors from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout from tensorflow.keras.optimizers import Adam import sys # Ustawienie kodowania na utf-8 sys.stdout.reconfigure(encoding='utf-8') # Funkcja do ładowania danych def load_and_prepare_data(file_path, text_column='text', label_column='label', is_test=False): if is_test: df = pd.read_csv(file_path, sep='\t', header=None, names=[text_column], on_bad_lines='skip') df['label'] = 0 # Dodanie kolumny label z wartością domyślną else: df = pd.read_csv(file_path, sep='\t', header=None, names=[label_column, text_column], on_bad_lines='skip') df[text_column].fillna('', inplace=True) return df # 1. Przygotowanie danych train_df = load_and_prepare_data('./train/train.tsv') dev_df = load_and_prepare_data('./dev-0/in.tsv', is_test=True) test_df = load_and_prepare_data('./test-A/in.tsv', is_test=True) dev_labels = pd.read_csv('./dev-0/expected.tsv', sep='\t', header=None, names=['label']) # 2. Przygotowanie modelu word2vec word2vec_path = './word2vec_100_3_polish.bin' w2v_model = KeyedVectors.load(word2vec_path) # 3. Przekształcenie tekstów na wektory def text_to_vector(text, model_w2v, vector_size=100): if not isinstance(text, str): return np.zeros((vector_size,)) words = text.split() words = [word for word in words if word in model_w2v] if words: return np.mean(model_w2v[words], axis=0) else: return np.zeros((vector_size,)) # 4. Budowa i trenowanie modelu def build_and_train_model(X_train, y_train): model = Sequential([ Dense(256, activation='relu', input_dim=100), Dropout(0.3), Dense(128, activation='relu'), Dropout(0.3), Dense(64, activation='relu'), Dropout(0.3), Dense(32, activation='relu'), Dropout(0.3), Dense(1, activation='sigmoid') ]) model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy']) weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train) weights_dict = dict(enumerate(weights)) model.fit(X_train, y_train, epochs=50, batch_size=32, class_weight=weights_dict, validation_split=0.2) return model # 5. Generowanie wyników def generate_results(df, model, filepath, text_column='text'): X_test = np.array([text_to_vector(text, w2v_model) for text in df[text_column]]) predictions = (model.predict(X_test) > 0.5).astype(int) pd.DataFrame(predictions).to_csv(filepath, sep='\t', index=False, header=False) # Przekształcenie tekstów na wektory X_train = np.array([text_to_vector(text, w2v_model) for text in train_df['text']]) y_train = train_df['label'].values # Trenowanie modelu model = build_and_train_model(X_train, y_train) # Generowanie wyników dla zbioru walidacyjnego i testowego generate_results(dev_df, model, "./dev-0/out.tsv") generate_results(test_df, model, "./test-A/out.tsv")