projekt-uczenie/projekt.ipynb
2022-06-18 14:48:44 +02:00

5.0 KiB

import lzma
import pandas as pd
import numpy as np
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is decoded as UTF-8. Each line is cut at its first tab, and the
    part before it is stripped of surrounding whitespace. Lines without a tab
    contribute their whole (stripped) content, so blank lines yield ``""``.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        return [row.partition("\t")[0].strip() for row in handle]
    
def writePred(filename, predictions):
    """Write each item of *predictions* to *filename*, one per line.

    Every item is converted with ``str()`` and followed by a newline. The
    file is created or truncated. It is written as UTF-8 explicitly so the
    output does not depend on the platform's default encoding and matches
    the encoding ``readFile`` uses for reading.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{pred}\n" for pred in predictions)

# Load the training set: a gzip-compressed, header-less TSV whose two columns
# are named 'isBall' (used as the target) and 'text' (used as the input).
with gzip.open('train.tsv.gz', 'rb') as f:
    # on_bad_lines='warn' replaces the deprecated error_bad_lines=False
    # (removed in pandas 2.0): rows with an unexpected number of fields are
    # still skipped and a warning is emitted for each one.
    data = pd.read_csv(f, sep='\t', on_bad_lines='warn', names=['isBall', 'text'])

# Plain NumPy arrays for the scikit-learn pipelines fitted below.
x = np.asarray(data['text'])
y = np.asarray(data['isBall'])

# First classifier: TF-IDF features fed into multinomial naive Bayes.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(x, y)

# Predict on the texts from in.tsv, then load the reference labels from
# expected.tsv and convert them to integers.
dev = readFile('in.tsv')
pred = model.predict(dev)
trueClass = list(map(int, readFile('expected.tsv')))

print('Naiwny bayes:')
predicted = list(pred)
# Labels are printed verbatim; each metric is called as metric(y_true, y_pred).
for label, metric in (
    ("recal score = ", recall_score),
    ("acuracy score = ", accuracy_score),
    ("precision score = ", precision_score),
    ("f score score = ", f1_score),
):
    print(label, metric(trueClass, predicted))

# Second classifier: TF-IDF features fed into logistic regression.
# max_iter is raised above the library default because the lbfgs solver
# stopped at the iteration limit on this data (ConvergenceWarning) instead of
# converging.
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
model.fit(x, y)
pred = model.predict(dev)

# Same four metrics as for the naive Bayes model, against the same labels.
print('\nRegresja logistyczna:')
print("recal score = ", recall_score(trueClass, list(pred)))
print("acuracy score = ", accuracy_score(trueClass, list(pred)))
print("precision score = ", precision_score(trueClass, list(pred)))
print("f score score = ", f1_score(trueClass, list(pred)))
C:\Users\szymo\AppData\Local\Temp\ipykernel_17472\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  data = pd.read_csv(f, sep='\t',error_bad_lines=False,names=['isBall','text'])
b'Skipping line 25706: expected 2 fields, saw 3\nSkipping line 58881: expected 2 fields, saw 3\nSkipping line 73761: expected 2 fields, saw 3\n'
Naiwny bayes:
recal score =  0.9939463822427212
acuracy score =  0.9889948642699926
precision score =  0.9888156008029825
f score score =  0.9913743530764807
c:\Users\szymo\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Regresja logistyczna:
recal score =  0.9979821274142404
acuracy score =  0.9891782831988262
precision score =  0.9852020489470689
f score score =  0.9915509093512818