Compare commits
No commits in common. "master" and "ISI-13-date" have entirely different histories.
master ... ISI-13-date

5  .gitignore  (vendored)
@@ -1,8 +1,11 @@
+in.tsv
+model.pkl
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 *.pkl
 .DS_Store
 .token
+.idea
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
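One reading of the new geval configuration (not spelled out in the repo): besides Accuracy and F1 it requests two F-scores with extreme beta values. Since

    F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall)

beta = 0 reduces to precision and a very large beta such as 9999999 is numerically indistinguishable from recall, so the two extra metrics are effectively precision and recall; the :N<Precision> and :N<Recall> suffixes presumably just label them that way in the report.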
10544  dev-0/expected.tsv
File diff suppressed because it is too large

10544  dev-0/out.tsv
File diff suppressed because it is too large
12  mytokenize.py  (new file)
@@ -0,0 +1,12 @@
+import re
+
+"""
+Takes a document and returns a list of tokens.
+"""
+def tokenize(d):
+    d = re.sub(r'(\s+|\\n)', ' ', d)
+    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
+    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
+    d = re.sub(r'\d+', 'NUM', d)
+
+    return re.split(r'\s+', d)
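As a quick illustration of what tokenize() returns (the sample sentence below is mine, not taken from the data):

    from mytokenize import tokenize

    doc = "I saw 3 lights over the lake!  See https://example.com/ufo"
    print(tokenize(doc))
    # ['i', 'saw', 'NUM', 'lights', 'over', 'the', 'lake', '!', 'see', 'url', '']

Because lower() runs after the URL substitution, the URL placeholder comes out as 'url', and a trailing empty token appears whenever the document ends in whitespace.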
10  predict.py  (new file)
@@ -0,0 +1,10 @@
+
+
+
+
+def predict(weights, word_to_index, tokenized_text):
+    ypred = weights[0]  # bias or w0
+    for x in tokenized_text:
+        index = word_to_index[x]
+        ypred += weights[index] * 1
+    return ypred
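Read together with mytokenize.py, predict() is a plain dot product of a bag-of-words vector with the weight vector, with weights[0] as the bias. A minimal sketch of how it gets called, using toy values of my own rather than a trained model:

    from predict import predict

    # toy model for illustration only; the real values come from training in solution.py
    weights = [2012.0, 0.3, -0.1]            # [bias, weight("ufo"), weight("the")]
    word_to_index = {"ufo": 1, "the": 2}

    print(predict(weights, word_to_index, ["the", "ufo"]))   # approximately 2012.2 (2012.0 - 0.1 + 0.3)

Note that any token missing from word_to_index raises a KeyError, so in the visible code predict() is only applied to the training texts that built the vocabulary.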
83  regresja liniowa pseudokod.txt  (new file)
@@ -0,0 +1,83 @@
+vocabulary = .... the set of all words ...
+
+word_to_index_mapping = {}
+index_to_word_mapping = {}
+ix = 1
+for w in vocabulary:
+    word_to_index_mapping[w] = ix
+    index_to_word_mapping[ix] = w
+    ix += 1
+
+# initialization
+weights = []
+for ix in xrange(0, len(vocabulary)+1):
+    weights[ix] = a random value from the interval (-0.01, 0.01)
+
+Or weights[0] = 2012.0
+
+learning_rate = 0.000001
+
+loss_sum = 0.0
+loss_sum_counter = 0
+while ....
+    d, y = a random example from the training set
+
+    # prediction
+    y_hat = weights[0]
+    for each word w in document d:
+        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
+
+    # loss function
+    loss = (y_hat - y)**2.0
+    loss_sum += loss
+    if loss_counter % 1000 == 0:
+        print(loss_sum / 1000)
+        loss_counter = 0
+        loss_sum = 0.0
+
+    # learning - the update
+    delta = (y_hat - y) * learning_rate
+    weights[0] = weights[0] - delta
+    for each word w in document d:
+        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
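The pseudocode above maps almost one-to-one onto Python. A minimal runnable sketch, with a toy corpus and loop bound of my own choosing (the real data comes from train/in.tsv):

    import random
    from collections import Counter

    # toy corpus of (tokenized document, target year) pairs
    train_set = [
        (["i", "saw", "a", "ufo"], 2012.5),
        (["the", "lights", "were", "a", "lantern"], 2013.1),
    ]

    vocabulary = {w for doc, _ in train_set for w in doc}
    word_to_index_mapping = {w: i for i, w in enumerate(vocabulary, start=1)}

    # initialization: index 0 is the bias, the rest are small random weights
    weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocabulary) + 1)]
    weights[0] = 2012.0
    learning_rate = 0.000001

    for step in range(10000):
        d, y = random.choice(train_set)
        counts = Counter(d)

        # prediction: bias + weight * occurrence count for every word in the document
        y_hat = weights[0]
        for w, c in counts.items():
            y_hat += weights[word_to_index_mapping[w]] * c

        # squared-error loss drives the SGD update
        delta = (y_hat - y) * learning_rate
        weights[0] -= delta
        for w, c in counts.items():
            weights[word_to_index_mapping[w]] -= c * delta

    print(weights[0])   # the learned bias stays near the target scale (~2012-2013)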
15  scores.txt
@@ -1,15 +0,0 @@
-0.6920
-0.6857
-0.6969
-0.6931
-0.6927
-0.6952
-0.6969
-0.6969
-0.6959
-0.6959
-0.6965
-0.6965
-0.6965
-0.6954
-0.6965
126  solution.py
@@ -1,20 +1,126 @@
-import re
-import sys
+import pandas as pd
+import pickle
+import csv
+from random import uniform
 
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
+from predict import predict
+from mytokenize import tokenize
+
+# load data:
+train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
+texts = train["text"]
+y = train["date"]
+temp_y = []
+for posix_time in y:
+    floatyear = 1970 + (posix_time / (60*60*24 * 365.25))
+    temp_y.append(floatyear)
+
+y = temp_y
+print(y[0])
+tokenized_texts = []
+word_to_index = {}
+index_to_word = {}
+word_count = {}
+learning_rate = 0.000001
+
+for doc in texts:
+    tokens = tokenize(doc)
+    tokenized_texts.append(tokens)
+    for token in tokens:
+        if token in word_count:
+            word_count[token] += 1
+        else:
+            word_count[token] = 1
+
+# vocabulary sorted from the most frequent words to the least frequent ones
+vocabulary = sorted(word_count, key=word_count.get, reverse=True)
+for w in vocabulary:
+    i = len(word_to_index) + 1
+    word_to_index[w] = i
+    index_to_word[i] = w
+
+weights = []
+for i in range(0, len(vocabulary) + 1):
+    weights.append(2012 * uniform(-0.01, 0.01))
+
+best_MSE = 408
+
+model = pickle.load(open("BestLinearRegressionModel.pkl", "rb"))
+word_to_index, weights, best_MSE = model
+print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
+
+loss_sum = 0.0
+loss_counter = 0
+for_MSE_sum = 0.0
+MSE = 0.0
+counter = 0
+inc_counter = 0
+while(True):
+    for i in range(0, len(tokenized_texts)):
+        """#@@ Computing ypred - start
+        ypred = weights[0]  # bias or w0
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            ypred += weights[index] * index
+        #@@ Computing ypred - end"""
+        ypred = predict(weights, word_to_index, tokenized_texts[i])
+
+        #@@ Computing the loss
+        loss = (ypred - y[i]) ** 2.0
+        loss_sum += loss
+        for_MSE_sum += loss
+        loss_counter += 1
+        if loss_counter == 1000:
+            #print(str(loss_sum/1000), "\t", str(MSE))
+            loss_counter = 0
+            loss_sum = 0.0
+        #@@ Computing the loss - end
+
+        #@@ Learning - updating the weights
+        delta = (ypred - y[i]) * learning_rate
+        weights[0] -= delta
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            weights[index] -= delta
+        #@@ Learning - updating the weights - end
+
+    temp_MSE = for_MSE_sum / len(tokenized_texts)
+    for_MSE_sum = 0.0
+    if best_MSE > temp_MSE:
+        best_MSE = temp_MSE
+        model = (word_to_index, weights, best_MSE)
+        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
+        with open("score.txt", "w") as out:
+            out.write("MSE:\t")
+            out.write(str(best_MSE))
+    if temp_MSE > MSE:
+        counter += 1
     else:
-        print("S")
+        inc_counter += 1
+
+    if counter > 2:
+        learning_rate *= 0.1
+        counter = 0
+
+    if inc_counter > 4:
+        learning_rate /= 0.90
+        inc_counter = 0
+    MSE = temp_MSE
+    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
+
+
 """
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
+with open("dev-0/out.tsv", "w") as out1:
+    for line in predicted_dev0:
+        out1.write(str(line))
+        out1.write("\n")
+
+with open("test-A/out.tsv", "w") as out2:
+    for line in predicted_testA:
+        out2.write(str(line))
+        out2.write("\n")
 """
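In the new solution.py the date target is converted from a POSIX timestamp to a fractional year as 1970 + t / (60*60*24*365.25). A quick sanity check of that formula, using a timestamp of my own choosing rather than one from the data:

    # 2013-01-01 00:00:00 UTC as a POSIX timestamp (illustrative value)
    posix_time = 1356998400
    floatyear = 1970 + (posix_time / (60 * 60 * 24 * 365.25))
    print(floatyear)   # ~2013.0007; slightly off because 365.25 days is only an average year length

So predictions, targets, and the reported MSE all live on a fractional-year scale; the starting best_MSE of 408 corresponds to an average error of roughly 20 years.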
4  start.sh
@@ -1,4 +0,0 @@
-xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
-
-xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
-geval -t dev-0 >>scores.txt
10304  test-A/out.tsv
File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

579158  train/expected.tsv
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
-comm -23 sortedP sortedS > PsetsubtractionS
-cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt
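For readers less used to the shell pipeline above: it extracts the words of the P-labelled and S-labelled documents into two sorted files, keeps the words that appear only on the paranormal side (comm -23), and counts them. Roughly the same idea in Python, as a sketch of my own using the same file paths:

    import lzma
    import re
    from collections import Counter

    p_words, s_words = Counter(), Counter()
    with open("expected.tsv") as exp, lzma.open("in.tsv.xz", "rt") as texts:
        for label, text in zip(exp, texts):
            words = re.findall(r"[a-zA-Z]+", text)
            (p_words if label.strip() == "P" else s_words).update(words)

    # words that occur only under the P label, most frequent first
    only_p = {w: c for w, c in p_words.items() if w not in s_words}
    for w, c in sorted(only_p.items(), key=lambda kv: -kv[1]):
        print(c, w)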