zadanie warsztaty

2020-06-26 13:25:29 +02:00 · 2020-06-26 13:25:29 +02:00 · 7dd0d160e1
commit 7dd0d160e1
3 changed files with 12395 additions and 0 deletions
--- a/script.py
+++ b/script.py
@ -0,0 +1,133 @@
 import pandas as pd # our main data management package
 import matplotlib.pyplot as plt # our main display package
 import string # used for preprocessing
 import re # used for preprocessing
 import nltk # the Natural Language Toolkit, used for preprocessing
 import numpy as np # used for managing NaNs
 import nltk
 from prefect import task, Flow, Parameter
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords # used for preprocessing
 from nltk.stem import WordNetLemmatizer # used for preprocessing
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression # our model
 from sklearn.model_selection import train_test_split
 # Fetch the NLTK resources used further down: the stop-word list, the WordNet
 # data behind the lemmatizer, and the 'punkt' models used by word_tokenize.
 for _resource in ('stopwords', 'wordnet', 'punkt'):
     nltk.download(_resource)

 # Load the training data; the per-column non-null counts are taken before the
 # 'keyword' and 'location' columns are dropped.
 train = pd.read_csv('train.csv')
 train_count = train.count()
 train = train.drop(columns=['keyword', 'location'])

 # The test data keeps all of its columns.
 test = pd.read_csv('test.csv')

 # Shared helpers for the preprocessing functions below.
 stop_words = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()

 # Accumulators that receive the preprocessed text of each dataset.
 pp_text_train = []
 pp_text_test = []
@task(log_stdout=True)
def describe(file):
    """Print and return the summary statistics of a dataset.

    Args:
        file: Object exposing a ``describe()`` method; the flows in this
            script pass the pandas DataFrames loaded from the CSV files.

    Returns:
        Whatever ``file.describe()`` returns.
    """
    summary = file.describe()
    # The task is declared with log_stdout=True, so the summary has to be
    # printed to show up anywhere; computing it alone has no visible effect.
    print(summary)
    return summary
 # "Prepare" flow: one `file` parameter that is handed to the describe task.
 with Flow("Prepare") as prepare:
     file = Parameter('file')
     describe(file)

 # Run the flow once for the training data, then once for the test data.
 for _dataset in (train, test):
     prepare.run(file=_dataset)
 # Matches, in order of preference at each position: an @handle, any single
 # character that is not an ASCII letter, digit, space or tab (this is what
 # strips the '#' from hashtags), or a scheme://... URL. Adapted from
 # https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
 # Written as a raw string so that \w, \/ and \S reach the regex engine
 # without relying on Python's deprecated handling of invalid string escapes.
 _TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


 def remove_urls(text):
     """Remove handles, URLs and non-alphanumeric characters from *text*.

     Every match of the noise pattern is replaced by a space, then runs of
     whitespace are collapsed to single spaces and the ends are trimmed.

     Args:
         text: The string to clean.

     Returns:
         The cleaned string, with words separated by single spaces.
     """
     return ' '.join(_TWEET_NOISE_RE.sub(' ', text).split())
 # Lowercase the whole string.
 def text_lowercase(text):
     """Return *text* with every character converted to lowercase."""
     lowered = text.lower()
     return lowered
 # Drop every run of digits.
 def remove_numbers(text):
     """Return *text* with all digit sequences deleted (nothing is inserted)."""
     return re.sub(r'\d+', '', text)
 # Delete ASCII punctuation characters.
 def remove_punctuation(text):
     """Return *text* without any character listed in ``string.punctuation``."""
     # Mapping a code point to None tells str.translate to delete it.
     deletions = dict.fromkeys(map(ord, string.punctuation))
     return text.translate(deletions)
 # Split a string into tokens.
 def tokenize(text):
     """Return the tokens produced by NLTK's ``word_tokenize`` for *text*."""
     return word_tokenize(text)
 # Filter out stop words.
 def remove_stopwords(text):
     """Return the tokens of *text* that are not in the ``stop_words`` set.

     *text* is an iterable of tokens; their order is preserved.
     """
     kept = []
     for token in text:
         if token in stop_words:
             continue
         kept.append(token)
     return kept
 # Lemmatize each token.
 def lemmatize(text):
     """Return a list with every token of *text* passed through the lemmatizer.

     *text* is an iterable of tokens; their order is preserved.
     """
     lemmas = []
     for token in text:
         lemmas.append(lemmatizer.lemmatize(token))
     return lemmas
 def preprocessing(text):
     """Run the full cleaning pipeline on one piece of text.

     The steps are applied in a fixed order: lowercase, strip
     handles/URLs/symbols, strip digits, strip punctuation, tokenize, drop
     stop words, lemmatize. The surviving tokens are joined with single
     spaces.

     Args:
         text: The raw string to clean.

     Returns:
         The cleaned text as a single space-separated string.
     """
     # String-to-string cleanup steps, applied in order.
     for clean in (text_lowercase, remove_urls, remove_numbers, remove_punctuation):
         text = clean(text)
     # Token-level steps work on the list produced by tokenize().
     tokens = lemmatize(remove_stopwords(tokenize(text)))
     return ' '.join(tokens)
@task(log_stdout=True)
def preprocess(file, pp_text):
    """Preprocess the 'text' column of a dataset and store the result.

    Both arguments are modified in place: the cleaned texts are appended to
    ``pp_text`` and written to a new 'pp_text' column of ``file``.

    Args:
        file: Table with a 'text' column; the flows in this script pass the
            pandas DataFrames loaded from the CSV files.
        pp_text: List that receives the cleaned texts of this call.
    """
    cleaned = [preprocessing(text_data) for text_data in file['text']]
    pp_text.extend(cleaned)
    # Build the column from this call's results only, so a pp_text list that
    # already holds entries cannot cause a length mismatch with the table.
    file['pp_text'] = cleaned
 # "Preprocess" flow: the dataset and its accumulator list are both parameters
 # of the preprocess task.
 with Flow("Preprocess") as preprocessData:
     file = Parameter('file')
     pp_text = Parameter('pp_text')
     preprocess(file, pp_text)

 # Run the flow for the training data first, then for the test data, each with
 # its own accumulator list.
 for _dataset, _collected in ((train, pp_text_train), (test, pp_text_test)):
     preprocessData.run(file=_dataset, pp_text=_collected)
 # Fit the TF-IDF vocabulary on the preprocessed text of both datasets.
 # NOTE(review): the vocabulary therefore also covers the test set and the rows
 # later held out for evaluation - confirm this is intended.
 train_text_data = list(train['pp_text'])
 test_text_data = list(test['pp_text'])
 corpus = train_text_data + test_text_data
 tf = TfidfVectorizer()
 fitted_vectorizer = tf.fit(corpus)

 # train
 y = train['target']
 train_transform = fitted_vectorizer.transform(train['pp_text'])

 # test (computed here but not used again in this script)
 test_transform = fitted_vectorizer.transform(test['pp_text'])

 X = train_transform
 print(X)

 # Hold out part of the training data for evaluation. No random_state is
 # passed, so the split differs from run to run.
 X_train, X_test, y_train, y_test = train_test_split(X, y)

 scikit_log_reg = LogisticRegression()
 model = scikit_log_reg.fit(X_train, y_train)
 predictions = model.predict(X_test)

 # Accuracy on the held-out rows: fraction of predictions equal to the label.
 count = sum(1 for guess, answer in zip(predictions, y_test) if guess == answer)
 print(count/len(y_test))
--- a/test.csv
+++ b/test.csv
--- a/train.csv
+++ b/train.csv