warsztaty-prefect/main.py

import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used by the stopword list, the lemmatizer and the
# tokenizer further down.
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Load the training and test sets from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Inspect the training data: non-null count per column, then the same counts
# as a percentage of the first column's count.
train.head()  # result is discarded unless run interactively
train_count = train.count()
print(train_count)
# train_count is indexed by column name, so select the first entry by
# position explicitly; plain [0] relies on a deprecated positional fallback.
print(train_count / train_count.iloc[0] * 100)
# Only the 'text' and 'target' columns are used below.
train = train.drop(columns=['keyword', 'location'])

test.head()  # result is discarded unless run interactively
test.describe()  # result is discarded unless run interactively

# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
# Written as a raw string so that \w, \/ and \S reach the regex engine
# unchanged instead of being invalid string escape sequences; compiled once
# because the function is called for every row.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def remove_urls(text):
    """Strip handles, URLs and non-alphanumeric characters from *text*.

    Each ``@handle``, each ``scheme://...`` URL and every single character
    that is not an ASCII letter, digit, space or tab is replaced by a space.
    Runs of whitespace are then collapsed to one space and the ends trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing is left.
    """
    return ' '.join(_TWEET_NOISE_RE.sub(' ', text).split())

def text_lowercase(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Return *text* with every run of digits deleted.

    Nothing is inserted in place of the digits, so surrounding whitespace is
    left exactly as it was.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def tokenize(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for *text*."""
    return word_tokenize(text)

# English stopwords from the NLTK corpus, held in a set and built once at
# import time.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return a list of the tokens in *text* that are not in ``stop_words``.

    *text* is an iterable of tokens; their order is preserved.
    """
    return [token for token in text if token not in stop_words]
    
# One shared WordNet lemmatizer instance, reused for every call.
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return a list with each token of *text* passed through the lemmatizer.

    *text* is an iterable of tokens; each is lemmatized with the lemmatizer's
    default settings and order is preserved.
    """
    return list(map(lemmatizer.lemmatize, text))

def preprocessing(text):
    """Run the full cleaning pipeline on one raw text and return a string.

    String stage: lowercase, strip handles/URLs/symbols, drop digits, drop
    punctuation. Token stage: tokenize, remove stopwords, lemmatize. The
    remaining tokens are joined back together with single spaces.
    """
    # NOTE(review): remove_urls already replaces every non-alphanumeric
    # character, so remove_punctuation looks redundant here — confirm before
    # removing it.
    cleaned = remove_punctuation(
        remove_numbers(remove_urls(text_lowercase(text)))
    )
    tokens = lemmatize(remove_stopwords(tokenize(cleaned)))
    return ' '.join(tokens)


# Preprocess every text and keep the cleaned version next to the raw column.
pp_text_train = [preprocessing(text_data) for text_data in train['text']]
train['pp_text'] = pp_text_train

pp_text_test = [preprocessing(text_data) for text_data in test['text']]
test['pp_text'] = pp_text_test

train_text_data = list(train['pp_text'])
test_text_data = list(test['pp_text'])
corpus = train_text_data + test_text_data

tf = TfidfVectorizer()
# Fit on train + test text so both are encoded in one shared feature space.
# NOTE(review): this also fits the vectorizer on the rows that are later held
# out for evaluation; consider fitting on the training split only.
fitted_vectorizer = tf.fit(corpus)

# train
train_transform = fitted_vectorizer.transform(train['pp_text'])
y = train['target']
# test
test_transform = fitted_vectorizer.transform(test['pp_text'])

X = train_transform
# Fixed seed so the hold-out split, and the accuracy printed below, are
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scikit_log_reg = LogisticRegression()
model = scikit_log_reg.fit(X_train, y_train)

# Hold-out accuracy: fraction of predictions that equal the true label.
predictions = model.predict(X_test)
count = int((predictions == y_test.to_numpy()).sum())
print(count / len(y_test))
add base project 2020-06-13 23:03:53 +02:00			`import pandas as pd`
			`import matplotlib.pyplot as plt`
			`import string`
			`import re`
			`import nltk`
			`import numpy as np`
			`import nltk`
			`from nltk.tokenize import word_tokenize`
			`from nltk.corpus import stopwords`
			`from nltk.stem import WordNetLemmatizer`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.linear_model import LogisticRegression`
			`from sklearn.model_selection import train_test_split`

			`nltk.download('stopwords')`
			`nltk.download('wordnet')`
			`nltk.download('punkt')`

			`train = pd.read_csv('train.csv')`
			`test = pd.read_csv('test.csv')`

			`train.head()`
			`train_count=train.count()`
			`print(train_count)`
			`print(train_count/train_count[0]*100)`
			`train = train.drop(['keyword', 'location'], axis = 1)`

			`test.head()`
			`test.describe()`

			`# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)`
			`def remove_urls(text):`
			`new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())`
			`return new_text`

			`# make all text lowercase`
			`def text_lowercase(text):`
			`return text.lower()`

			`# remove numbers`
			`def remove_numbers(text):`
			`result = re.sub(r'\d+', '', text)`
			`return result`

			`# remove punctuation`
			`def remove_punctuation(text):`
			`translator = str.maketrans('', '', string.punctuation)`
			`return text.translate(translator)`

			`# tokenize`
			`def tokenize(text):`
			`text = word_tokenize(text)`
			`return text`

			`# remove stopwords`
			`stop_words = set(stopwords.words('english'))`
			`def remove_stopwords(text):`
			`text = [i for i in text if not i in stop_words]`
			`return text`

			`# lemmatize`
			`lemmatizer = WordNetLemmatizer()`
			`def lemmatize(text):`
			`text = [lemmatizer.lemmatize(token) for token in text]`
			`return text`

			`def preprocessing(text):`
			`text = text_lowercase(text)`
			`text = remove_urls(text)`
			`text = remove_numbers(text)`
			`text = remove_punctuation(text)`
			`text = tokenize(text)`
			`text = remove_stopwords(text)`
			`text = lemmatize(text)`
			`text = ' '.join(text)`
			`return text`


			`pp_text_train = [] # our preprocessed text column`
			`for text_data in train['text']:`
			`pp_text_data = preprocessing(text_data)`
			`pp_text_train.append(pp_text_data)`
			`train['pp_text'] = pp_text_train # add the preprocessed text as a column`

			`pp_text_test = [] # our preprocessed text column`
			`for text_data in test['text']:`
			`pp_text_data = preprocessing(text_data)`
			`pp_text_test.append(pp_text_data)`
			`test['pp_text'] = pp_text_test # add the preprocessed text as a column`

			`train_text_data = list(train['pp_text'])`
			`test_text_data = list(test['pp_text'])`
			`corpus = train_text_data + test_text_data`

			`tf=TfidfVectorizer()`
			`# the vectorizer must be fit onto the entire corpus`
			`fitted_vectorizer = tf.fit(corpus)`

			`# train`
			`train_transform = fitted_vectorizer.transform(train['pp_text'])`
			`y = train['target']`
			`# test`
			`test_transform = fitted_vectorizer.transform(test['pp_text'])`

			`X=train_transform`
			`X_train, X_test, y_train, y_test = train_test_split(X, y)`

			`scikit_log_reg = LogisticRegression()`
			`model=scikit_log_reg.fit(X_train, y_train)`

			`predictions = model.predict(X_test)`
			`count = 0`
			`for guess, answer in zip(predictions, y_test):`
			`if guess == answer:`
			`count += 1`
			`print(count/len(y_test))`