paranormal-or-skeptic/Untitled-checkpoint.ipynb at 35d0bbd84909501f8c727ed3a6959d1e29aad2c6

import pandas as pd

# Load the training documents: an xz-compressed, tab-separated file without a
# header row whose two columns are named "text" and "time" here.
train = pd.read_csv(
    "train/in.tsv.xz",
    sep="\t",
    compression="xz",
    header=None,
    names=["text", "time"],
)
# Gold labels, read from a separate header-less file.
expected = pd.read_csv("train/expected.tsv", header=None)

# Attach the labels to the documents as an extra column.
train["expected"] = expected

# Summary statistics of the character length of documents labelled ' S'
# (the label compared against includes a leading space).
train.loc[train["expected"] == ' S', "text"].str.len().describe()

count    185478.000000
mean        303.405056
std         494.328936
min           3.000000
25%          68.000000
50%         151.000000
75%         341.000000
max       10251.000000
Name: text, dtype: float64

# Summary statistics of the character length of documents labelled ' P'
# (the label compared against includes a leading space).
train.loc[train["expected"] == ' P', "text"].str.len().describe()

count    104063.000000
mean        298.150995
std         504.984133
min           3.000000
25%          65.000000
50%         146.000000
75%         330.000000
max       10161.000000
Name: text, dtype: float64

import string
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below *before* the first access to them:
# "stopwords" backs stopwords.words() and "punkt" backs word_tokenize().
# Packages that are already installed and up to date are not re-downloaded.
nltk.download("stopwords")
nltk.download("punkt")
# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# English stop words (a set gives constant-time membership tests).  The name
# is kept because clean_text() looks it up.
stopwords = set(stopwords.words('english'))

def clean_text(text):
    """Tokenise *text* into lowercase alphabetic tokens without stop words.

    Args:
        text: Raw document string.  Any non-string value (for example the
            float NaN pandas uses for an empty field) is treated as an
            empty document.

    Returns:
        A list of lowercased tokens in their original order, keeping only
        tokens that consist entirely of letters and are not in the
        module-level ``stopwords`` set.
    """
    if not isinstance(text, str):
        # word_tokenize() expects a string; a missing value carries no words.
        return []
    # str.isalpha() already rejects every token that contains punctuation or
    # digits, so no separate punctuation-stripping pass is needed afterwards.
    lowered = (token.lower() for token in word_tokenize(text) if token.isalpha())
    return [token for token in lowered if token not in stopwords]

# Replace every raw document in the "text" column with the token list that
# clean_text() returns for it.
train['text'] = train['text'].apply(clean_text)

[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

# Preview the tokenised "text" column.
train['text']

0                               [medical, issues, recently]
1         [supposedly, aluminum, barium, strontium, used...
2                               [nobel, prizes, make, rich]
3                           [came, article, stayed, doctor]
4         [resorted, insults, got, owned, directly, afte...
                                ...                        
289536    [really, baby, shampoo, actually, highly, alka...
289537    [gives, example, brendan, reilly, doctor, came...
289538                                 [ca, fix, stupidity]
289539    [excellent, points, also, looking, bit, progra...
289540         [earlier, year, may, couple, days, ago, nov]
Name: text, Length: 289541, dtype: object

from collections import Counter
def counter(text):
    """Tally how often each item occurs across a collection of sequences.

    ``text`` is an iterable of iterables of hashable items (in this notebook:
    one token list per document).  Returns a ``Counter`` mapping every item
    to its total number of occurrences over all inner iterables.
    """
    return Counter(token for tokens in text for token in tokens)

# Token frequencies per class, counted over the "text" column of the rows
# carrying the respective label.
text_cnt_s = counter(train.loc[train['expected'] == ' S', 'text'])
text_cnt_p = counter(train.loc[train['expected'] == ' P', 'text'])

# Keep the 100 most frequent tokens of each class as a two-column table.
text_s = pd.DataFrame(text_cnt_s.most_common(100), columns=['words', 'counts'])
text_p = pd.DataFrame(text_cnt_p.most_common(100), columns=['words', 'counts'])

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Stack both top-100 tables into one long frame, tagging each row with the
# class it came from.  ignore_index=True gives the result a fresh 0..n-1
# index instead of repeating the row labels of the two inputs.
concatenated = pd.concat(
    [text_s.assign(dataset='s'), text_p.assign(dataset='p')],
    ignore_index=True,
)
sns.set(style="whitegrid")
# `style` is not a catplot parameter (it belongs to the relational plots);
# `hue` is the semantic that splits the bars by the "dataset" column.
g = sns.catplot(x="words", y="counts", hue="dataset", data=concatenated,
                height=6, kind="bar", palette="muted")

/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

	counts1	counts2	dataset	words1	words2
0	39094.0	NaN	s	would	NaN
1	36978.0	NaN	s	like	NaN
2	36461.0	NaN	s	people	NaN
3	29143.0	NaN	s	one	NaN
4	26827.0	NaN	s	think	NaN
...	...	...	...	...	...
95	NaN	3007.0	p	NaN	kind
96	NaN	2990.0	p	NaN	show
97	NaN	2970.0	p	NaN	far
98	NaN	2964.0	p	NaN	feel
99	NaN	2915.0	p	NaN	try

200 rows × 5 columns

11 KiB Raw Blame History Unescape Escape

11 KiB

Raw Blame History