Workshop assignment
commit 7dd0d160e1

script.py | +133 lines (new file)

@@ -0,0 +1,133 @@
import pandas as pd  # our main data management package
import matplotlib.pyplot as plt  # our main display package
import string  # used for preprocessing
import re  # used for preprocessing
import nltk  # the Natural Language Toolkit, used for preprocessing
import numpy as np  # used for managing NaNs
from prefect import task, Flow, Parameter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  # used for preprocessing
from nltk.stem import WordNetLemmatizer  # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # our model
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

train = pd.read_csv('train.csv')
train_count = train.count()  # per-column non-null counts
train = train.drop(['keyword', 'location'], axis=1)

test = pd.read_csv('test.csv')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

pp_text_train = []  # our preprocessed text column
pp_text_test = []  # our preprocessed text column


# print summary statistics for a dataset from a Prefect task
@task(log_stdout=True)
def describe(file):
    print(file.describe())


with Flow("Prepare") as prepare:
    file = Parameter('file')
    describe(file)


prepare.run(file=train)
prepare.run(file=test)


# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
    new_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
    return new_text


# make all text lowercase
def text_lowercase(text):
    return text.lower()


# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result


# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)


# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text


# remove stopwords
def remove_stopwords(text):
    text = [i for i in text if i not in stop_words]
    return text


# lemmatize
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text


# full preprocessing pipeline: clean, tokenize, filter and lemmatize one text
def preprocessing(text):
    text = text_lowercase(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = ' '.join(text)
    return text


@task(log_stdout=True)
def preprocess(file, pp_text):
    for text_data in file['text']:
        pp_text_data = preprocessing(text_data)
        pp_text.append(pp_text_data)
    file['pp_text'] = pp_text  # add the preprocessed text as a column

with Flow("Preprocess") as preprocessData:
|
||||||
|
file = Parameter('file')
|
||||||
|
pp_text = Parameter('pp_text')
|
||||||
|
preprocess(file, pp_text)
|
||||||
|
|
||||||
|
preprocessData.run(file=train, pp_text=pp_text_train)
|
||||||
|
preprocessData.run(file=test, pp_text=pp_text_test)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|

# fit the TF-IDF vectorizer on the combined train + test text
train_text_data = list(train['pp_text'])
test_text_data = list(test['pp_text'])
corpus = train_text_data + test_text_data
tf = TfidfVectorizer()
fitted_vectorizer = tf.fit(corpus)

# train
train_transform = fitted_vectorizer.transform(train['pp_text'])
y = train['target']
# test
test_transform = fitted_vectorizer.transform(test['pp_text'])

X = train_transform
print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)

scikit_log_reg = LogisticRegression()
model = scikit_log_reg.fit(X_train, y_train)
predictions = model.predict(X_test)

# fraction of correct predictions on the held-out split
count = 0
for guess, answer in zip(predictions, y_test):
    if guess == answer:
        count += 1
print(count / len(y_test))