11 KiB
11 KiB
import pandas as pd
# Load the training texts (xz-compressed TSV: text + timestamp) and the labels.
train = pd.read_csv("train/in.tsv.xz", header=None, compression='xz',
                    sep="\t", names=["text", "time"])
expected = pd.read_csv("train/expected.tsv", header=None)
# Select column 0 explicitly (a Series); assigning a whole one-column
# DataFrame to a single column is fragile across pandas versions.
train["expected"] = expected[0]
# Text-length distribution for the ' S' class (labels carry a leading space).
train.loc[train["expected"] == ' S', "text"].str.len().describe()
count 185478.000000 mean 303.405056 std 494.328936 min 3.000000 25% 68.000000 50% 151.000000 75% 341.000000 max 10251.000000 Name: text, dtype: float64
# Text-length distribution for the ' P' class (labels carry a leading space).
train.loc[train["expected"] == ' P', "text"].str.len().describe()
count 104063.000000 mean 298.150995 std 504.984133 min 3.000000 25% 65.000000 50% 146.000000 75% 330.000000 max 10161.000000 Name: text, dtype: float64
import string
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the imported nltk
# corpus module to a plain set of English stopwords; clean_text below
# relies on it being a set, so keep the name if refactoring.
stopwords = set(stopwords.words('english'))
# Fetch the Punkt sentence/word tokenizer models required by word_tokenize.
nltk.download("punkt")
def clean_text(text):
    """Tokenize a document and keep lowercased, non-stopword alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lowercased alphabetic tokens with English stopwords removed.
    """
    tokens = word_tokenize(text)
    # isalpha() already rejects any token containing punctuation, so the
    # original str.translate punctuation pass was a no-op and is dropped.
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # `stopwords` is the module-level set built from nltk's English list.
    return [word for word in tokens if word not in stopwords]
# NOTE(review): overwrites the raw strings in place — the original text is
# unrecoverable afterwards; a new column (e.g. train['tokens']) would be safer.
train['text'] = train['text'].apply(clean_text)
[nltk_data] Downloading package punkt to /home/th3niko/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
# Preview the cleaned token lists.
train['text']
0 [medical, issues, recently] 1 [supposedly, aluminum, barium, strontium, used... 2 [nobel, prizes, make, rich] 3 [came, article, stayed, doctor] 4 [resorted, insults, got, owned, directly, afte... ... 289536 [really, baby, shampoo, actually, highly, alka... 289537 [gives, example, brendan, reilly, doctor, came... 289538 [ca, fix, stupidity] 289539 [excellent, points, also, looking, bit, progra... 289540 [earlier, year, may, couple, days, ago, nov] Name: text, Length: 289541, dtype: object
from collections import Counter
def counter(text):
    """Count token frequencies across an iterable of token lists.

    Parameters
    ----------
    text : iterable of list of str
        Each element is one document's list of tokens.

    Returns
    -------
    collections.Counter
        Mapping of token -> total occurrence count over all documents.
    """
    cnt = Counter()
    for msgs in text:
        # Counter.update adds counts element-wise; replaces the manual inner loop.
        cnt.update(msgs)
    return cnt
# Per-class token counts, then the 100 most frequent words as small frames.
text_cnt_s = counter(train.loc[train['expected'] == ' S', 'text'])
text_cnt_p = counter(train.loc[train['expected'] == ' P', 'text'])
text_s = pd.DataFrame(text_cnt_s.most_common(100), columns=['words', 'counts'])
text_p = pd.DataFrame(text_cnt_p.most_common(100), columns=['words', 'counts'])
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Stack the two top-100 tables, tagging each row with its source class.
# sort=False silences the pandas FutureWarning about non-concatenation-axis
# sorting (a no-op here since both frames share the same columns).
concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')],
                         sort=False)
concatenated
sns.set(style="whitegrid")
# catplot's bar kind has no `style` parameter — use hue to distinguish
# the two classes instead.
g = sns.catplot(x="words", y="counts", hue="dataset", data=concatenated,
                height=6, kind="bar", palette="muted")
/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
counts1 | counts2 | dataset | words1 | words2 | |
---|---|---|---|---|---|
0 | 39094.0 | NaN | s | would | NaN |
1 | 36978.0 | NaN | s | like | NaN |
2 | 36461.0 | NaN | s | people | NaN |
3 | 29143.0 | NaN | s | one | NaN |
4 | 26827.0 | NaN | s | think | NaN |
... | ... | ... | ... | ... | ... |
95 | NaN | 3007.0 | p | NaN | kind |
96 | NaN | 2990.0 | p | NaN | show |
97 | NaN | 2970.0 | p | NaN | far |
98 | NaN | 2964.0 | p | NaN | feel |
99 | NaN | 2915.0 | p | NaN | try |
200 rows × 5 columns