Compare commits
No commits in common. "ISI-13" and "master" have entirely different histories.
5  .gitignore  (vendored)
@@ -1,11 +1,8 @@
in.tsv
model.pkl

*~
*.swp
*.bak
*.pyc
*.o

.DS_Store
.token
.idea
Binary file not shown.
@@ -5,7 +5,7 @@ Classify a Reddit post as either from the Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).

Output labels are 0 (for skeptic) and 1 (for paranormal).
Output labels are `S` (skeptic) and `P` (paranormal).

Sources
-------
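For a concrete sense of the data format implied by the scripts in this diff (the post text and timestamp below are invented, and the exact format of the date field is an assumption): each in.tsv line holds the post text and a date field separated by a tab, and the matching out.tsv line holds a single label.

    in.tsv  (text<TAB>date):       saw strange lights over the lake last night, can anyone explain this?<TAB>1348620574
    out.tsv (one label per line):  P        # would have been 1 under the old 0/1 labels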
@@ -1 +1 @@
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--metric Accuracy --precision 4
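A note on the removed metric flags (my reading, not stated in the diff): F<beta> is the weighted F-score

    F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)

which equals precision P when beta = 0 and tends to recall R as beta grows large, so `F0:N<Precision>` and `F9999999:N<Recall>` presumably just report precision and recall under those display names. The new configuration keeps only Accuracy, printed to 4 decimal places.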
10544  dev-0/expected.tsv
File diff suppressed because it is too large
10544  dev-0/out.tsv
File diff suppressed because it is too large
@@ -1,12 +0,0 @@
import re


def tokenize(d):
    """Takes a document and returns a list of tokens."""
    d = re.sub(r'(\s+|\\n)', ' ', d)                    # collapse whitespace and literal "\n" sequences
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)  # replace URLs with a placeholder
    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
    d = re.sub(r'\d+', 'NUM', d)                        # replace digit runs with NUM
    return re.split(r'\s+', d)
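A minimal usage sketch of the removed tokenizer; the sentence is invented for illustration and assumes tokenize() from the block above is in scope:

    print(tokenize("I saw 2 ghosts! See https://example.com"))
    # should print: ['i', 'saw', 'NUM', 'ghosts', '!', 'see', 'url', '']
    # The URL placeholder comes out lower-cased because lower() runs after the URL
    # substitution, and the trailing '' is left over from the trailing space it inserts.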
13  predict.py
@@ -1,13 +0,0 @@
def predict(weights, word_to_index, tokenized_text):
    ypred = weights[0]  # bias or w0
    for x in tokenized_text:
        if x in word_to_index:
            index = word_to_index[x]
            ypred += weights[index] * 1
    return ypred
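In other words, the removed helper scores a document as a linear bag-of-words model:

    ypred = weights[0] + sum(weights[word_to_index[t]] for t in tokenized_text if t in word_to_index)

i.e. the bias plus one weight contribution per occurrence of each in-vocabulary token (the `* 1` is just an explicit per-occurrence count of 1).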
@@ -1,20 +0,0 @@
import pickle
import sys

from predict import predict
from mytokenize import tokenize


# Load the pickled best model: a (word_to_index, weights, best_MSE) tuple.
path_pkl = "BestLinearRegressionModel.pkl"
model = pickle.load(open(path_pkl, "rb"))
word_to_index, weights, best_MSE = model

# Read "text<TAB>date" lines from stdin and print a 0/1 label per line.
for line in sys.stdin:
    text, date = line.split("\t")
    tokenized_text = tokenize(text)
    ypred = predict(weights, word_to_index, tokenized_text)
    if ypred < 0.5:  # threshold the regression output at 0.5
        print("0")
    else:
        print("1")
@@ -1,83 +0,0 @@
vocabulary = ....  # the set of all words ...

word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1

# initialization
weights = []
for ix in range(0, len(vocabulary) + 1):
    weights.append(a random value from the interval (-0.01, 0.01))

# Alternatively: weights[0] = 2012.0

learning_rate = 0.000001

loss_sum = 0.0
loss_counter = 0
while ....:
    d, y = a random example from the training set

    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)

    # loss function
    loss = (y_hat - y) ** 2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0

    # learning - weight update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
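For reference (this derivation is mine, not part of the notes): the update above is stochastic gradient descent on the squared error. Writing y_hat = weights[0] + sum_j weights[j] * x_j, where x_j is the number of occurrences of word j in document d, and loss = (y_hat - y)**2, the partial derivatives are

    d(loss)/d(weights[0]) = 2 * (y_hat - y)
    d(loss)/d(weights[j]) = 2 * (y_hat - y) * x_j

so the updates weights[0] -= delta and weights[j] -= x_j * delta, with delta = (y_hat - y) * learning_rate, are exactly the gradient steps with the constant factor 2 folded into the learning rate.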
15  scores.txt  (Normal file)
@@ -0,0 +1,15 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965
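These fifteen figures are evidently the dev-0 Accuracy values appended one per run by `geval -t dev-0 >>scores.txt` in the updated start.sh, printed to four decimal places as set by --precision 4 in the configuration above.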
128  solution.py
@@ -1,120 +1,20 @@
import os
import re
import sys

import pandas as pd
import pickle
import csv
from random import uniform

from predict import predict
from mytokenize import tokenize

# load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = pd.read_csv("train/expected.tsv", header=None, names=["isparanormal"])
y = list(y["isparanormal"])
print(y)
print(y[0])

tokenized_texts = []
word_to_index = {}
index_to_word = {}
word_count = {}
learning_rate = 0.000001

for doc in texts:
    tokens = tokenize(doc)
    tokenized_texts.append(tokens)
    for token in tokens:
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1

# vocabulary sorted from the most frequent words to the least frequent ones
vocabulary = sorted(word_count, key=word_count.get, reverse=True)
for w in vocabulary:
    i = len(word_to_index) + 1
    word_to_index[w] = i
    index_to_word[i] = w

weights = []
for i in range(0, len(vocabulary) + 1):
    weights.append(uniform(-0.01, 0.01))

best_MSE = 1800

path_pkl = "BestLinearRegressionModel.pkl"
if os.path.isfile(path_pkl):
    model = pickle.load(open(path_pkl, "rb"))
    word_to_index, weights, best_MSE = model
    print("Unpickled best model.", "\tbest_MSE: ", best_MSE)

loss_sum = 0.0
loss_counter = 0
for_MSE_sum = 0.0
MSE = 0.0
counter = 0
inc_counter = 0
while True:
    for i in range(0, len(tokenized_texts)):
        #@@ computing ypred - start
        ypred = predict(weights, word_to_index, tokenized_texts[i])
        #@@ computing ypred - end

        #@@ computing the loss - start
        loss = (ypred - y[i]) ** 2.0
        loss_sum += loss
        for_MSE_sum += loss
        loss_counter += 1
        #@@ computing the loss - end

        if loss_counter == 1000:
            # print(str(loss_sum/1000), "\t", str(MSE))
            loss_counter = 0
            loss_sum = 0.0

        #@@ learning - weight update - start
        delta = (ypred - y[i]) * learning_rate
        weights[0] -= delta
        for x in tokenized_texts[i]:
            index = word_to_index[x]
            weights[index] -= delta
        #@@ learning - weight update - end

    #@@ optional strategy to improve training - start
    temp_MSE = for_MSE_sum / len(tokenized_texts)
    for_MSE_sum = 0.0
    if best_MSE > temp_MSE:
        best_MSE = temp_MSE
        model = (word_to_index, weights, best_MSE)
        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
        with open("score.txt", "w") as out:
            out.write("MSE:\t")
            out.write(str(best_MSE))
    if temp_MSE > MSE:
        counter += 1
    else:
        inc_counter += 1

    if counter > 2:
        learning_rate *= 0.1
        counter = 0

    if inc_counter > 4:
        learning_rate /= 0.90
        inc_counter = 0
    #@@ optional strategy to improve training - end

    MSE = temp_MSE
    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))

# The replacement 20-line solution.py: a keyword-based classifier that reads stdin and prints P/S.
for line in sys.stdin:
    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
        print("P")
    else:
        print("S")

# Additional keyword candidates, kept (unused) in a string literal:
"""
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""
7  start.sh
@@ -1,5 +1,4 @@
xzcat dev-0/in.tsv.xz | python3 predict_dev0_testA.py > dev-0/out.tsv
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv

xzcat test-A/in.tsv.xz | python3 predict_dev0_testA.py > test-A/out.tsv

geval -t dev-0
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt
10304  test-A/out.tsv
File diff suppressed because it is too large
43387  train/PsetsubtractionS_counted.txt  (Normal file)
File diff suppressed because it is too large
1  train/PssS_c_clean  (Normal file)
File diff suppressed because one or more lines are too long
579158  train/expected.tsv
File diff suppressed because it is too large
4  train/s.sh  (Executable file)
@@ -0,0 +1,4 @@
# Build a frequency-sorted list of words that occur in "P" documents but not in "S" documents.
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
# comm -23 keeps lines unique to sortedP, i.e. the set difference P \ S.
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt