Compare commits

...

No commits in common. "master" and "ISI-13-date" have entirely different histories.

17 changed files with 305504 additions and 348700 deletions

.gitignore vendored

@@ -1,8 +1,11 @@
in.tsv
model.pkl
*~
*.swp
*.bak
*.pyc
*.o
*.pkl
.DS_Store
.token
.idea


@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
Sources
-------


@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
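For context (not part of the diff): the F0:N<Precision> and F9999999:N<Recall> entries appear to use extreme beta values of the F-score as stand-ins for precision and recall, which works because F_beta reduces to precision as beta approaches 0 and tends to recall as beta grows. A quick sanity check in plain Python, independent of geval:

def f_beta(precision, recall, beta):
    # generalized F-score: (1 + b^2) * P * R / (b^2 * P + R); denominator must be non-zero
    b2 = beta ** 2
    return (1 + b2) * precision * recall / (b2 * precision + recall)

print(f_beta(0.8, 0.4, 0))        # ~0.8: equals precision (up to float rounding)
print(f_beta(0.8, 0.4, 9999999))  # ~0.4: approaches recall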

File diff suppressed because it is too large

File diff suppressed because it is too large

mytokenize.py Normal file

@@ -0,0 +1,12 @@
import re
"""
Takes a document and returns a list of tokens.
"""
def tokenize(d):
d = re.sub(r'(\s+|\\n)', ' ', d)
d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
d = re.sub(r'\d+', 'NUM', d)
return re.split(r'\s+', d)
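For reference, a quick interactive check of tokenize (the input sentence below is made up):

>>> from mytokenize import tokenize
>>> tokenize("I saw 2 strange lights! More at https://example.com tonight.")
['i', 'saw', 'NUM', 'strange', 'lights', '!', 'more', 'at', 'url', 'tonight', '.']

Note that the URL placeholder comes out lowercased ('url') because lowercasing runs after the URL substitution, while NUM stays uppercase because digit replacement runs last.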

predict.py Normal file

@@ -0,0 +1,10 @@
def predict(weights, word_to_index, tokenized_text):
ypred = weights[0] # bias or w0
for x in tokenized_text:
index = word_to_index[x]
ypred += weights[index] * 1
return ypred
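A toy call with made-up numbers (index 0 holds the bias); a token missing from word_to_index would raise a KeyError, and a repeated token adds its weight once per occurrence:

>>> from predict import predict
>>> weights = [2012.0, 0.5, -0.25]
>>> word_to_index = {"ufo": 1, "night": 2}
>>> predict(weights, word_to_index, ["ufo", "night", "ufo"])
2012.75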


@@ -0,0 +1,83 @@
vocabulary = .... the set of all words ...
word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1
# initialization
weights = []
for ix in xrange(0, len(vocabulary)+1):
    weights[ix] = random value from the interval (-0.01, 0.01)
or: weights[0] = 2012.0
learning_rate = 0.000001
loss_sum = 0.0
loss_counter = 0
while ....
    d, y = random example from the training set
    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
    # loss function
    loss = (y_hat - y)**2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0
    # learning - weight update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
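Below, for reference, is a minimal runnable Python sketch of the same loop on toy data; the example documents, target years, and step count are made up, not taken from the repository:

import random

# toy corpus: (text, target year)
docs = [
    ("saw a ufo at night", 2008.5),
    ("bought a new camera", 2015.2),
    ("strange lights in the sky", 2011.0),
]

# index 0 is reserved for the bias weight
vocabulary = sorted({w for text, _ in docs for w in text.split()})
word_to_index_mapping = {w: i + 1 for i, w in enumerate(vocabulary)}

weights = [2012.0] + [random.uniform(-0.01, 0.01) for _ in vocabulary]
learning_rate = 0.000001
loss_sum, loss_counter = 0.0, 0

for step in range(100000):
    text, y = random.choice(docs)  # a random training example
    words = text.split()
    # prediction: bias plus the weight of each word occurrence
    y_hat = weights[0] + sum(weights[word_to_index_mapping[w]] for w in words)
    # squared-error loss, averaged and printed every 1000 steps
    loss_sum += (y_hat - y) ** 2.0
    loss_counter += 1
    if loss_counter == 1000:
        print(loss_sum / 1000)
        loss_sum, loss_counter = 0.0, 0
    # SGD update
    delta = (y_hat - y) * learning_rate
    weights[0] -= delta
    for w in words:
        weights[word_to_index_mapping[w]] -= delta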

score.txt Normal file

@@ -0,0 +1 @@
MSE: 373.4577154450468
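For scale, assuming this is the mean squared error of the predicted fractional year (as computed in the training script), the corresponding RMSE is 373.4577... ** 0.5 ≈ 19.3 years.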


@@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965


@@ -1,20 +1,126 @@
import re
import sys
import pandas as pd
import pickle
import csv
from random import uniform
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
from predict import predict
from mytokenize import tokenize
#load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = train["date"]
temp_y = []
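# approximate conversion from POSIX seconds to a fractional year (assumes 365.25-day years; calendar details are ignored)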
for posix_time in y:
floatyear = 1970 + (posix_time / (60*60*24 * 365.25))
temp_y.append(floatyear)
y = temp_y
print(y[0])
tokenized_texts = []
word_to_index = {}
index_to_word = {}
word_count = {}
learning_rate = 0.000001
for doc in texts:
tokens = tokenize(doc)
tokenized_texts.append(tokens)
for token in tokens:
if token in word_count:
word_count[token] += 1
else:
word_count[token] = 1
#vocabulary sorted from the most frequent words to the least frequent ones
vocabulary = sorted(word_count, key= word_count.get, reverse=True)
for w in vocabulary:
i = len(word_to_index) + 1
word_to_index[w] = i
index_to_word[i] = w
weights = []
for i in range(0, len(vocabulary) + 1):
weights.append(2012 * uniform(-0.01, 0.01))
best_MSE = 408
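# resume from the best model saved so far; this assumes BestLinearRegressionModel.pkl already exists and overwrites the freshly initialized weights (and word_to_index mapping) built above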
model = pickle.load(open("BestLinearRegressionModel.pkl", "rb"))
word_to_index, weights, best_MSE = model
print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
loss_sum = 0.0
loss_counter = 0
for_MSE_sum = 0.0
MSE = 0.0
counter = 0
inc_counter = 0
while(True):
for i in range(0, len(tokenized_texts)):
"""#@@ Obliczanie ypred - start
ypred = weights[0]#bias or w0
for x in tokenized_texts[i]:
index = word_to_index[x]
ypred += weights[index] * index
#@@ Computing ypred - end"""
ypred = predict(weights, word_to_index, tokenized_texts[i])
#@@ Computing the loss
loss = (ypred - y[i]) ** 2.0
loss_sum += loss
for_MSE_sum += loss
loss_counter += 1
if loss_counter == 1000:
#print(str(loss_sum/1000), "\t", str(MSE))
loss_counter = 0
loss_sum = 0.0
#@@ Computing the loss - end
#@@ Learning - weight update
delta = (ypred - y[i]) * learning_rate
weights[0] -= delta
for x in tokenized_texts[i]:
index = word_to_index[x]
weights[index] -= delta
#@@ Learning - weight update - end
temp_MSE = for_MSE_sum / len(tokenized_texts)
for_MSE_sum = 0.0
if best_MSE > temp_MSE:
best_MSE = temp_MSE
model = (word_to_index, weights, best_MSE)
pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
with open("score.txt", "w") as out:
out.write("MSE:\t")
out.write(str(best_MSE))
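# crude learning-rate schedule: every 3rd epoch in which MSE got worse shrinks the rate 10x,
# and every 5th epoch in which it did not raises it by about 11% (division by 0.90)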
if temp_MSE > MSE:
counter += 1
else:
print("S")
inc_counter += 1
if counter > 2:
learning_rate *= 0.1
counter = 0
if inc_counter > 4:
learning_rate /= 0.90
inc_counter = 0
MSE = temp_MSE
print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
"""
with open("dev-0/out.tsv", "w") as out1:
for line in predicted_dev0:
out1.write(str(line))
out1.write("\n")
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""
with open("test-A/out.tsv", "w") as out2:
for line in predicted_testA:
out2.write(str(line))
out2.write("\n")
"""


@@ -1,4 +0,0 @@
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -1,4 +0,0 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt