result and scripts
This commit is contained in:
parent 85a53fe007
commit 28e592072c
33	eval.py	Normal file
@@ -0,0 +1,33 @@
import csv


def flatten(t):
    # Flatten a list of lists into a single flat list.
    return [item for sublist in t for item in sublist]


with open('./result.txt', mode='r', encoding='utf-8') as result:
    with open('./train/expected.tsv', 'r', encoding='utf-8') as expected:

        result_vector = []
        expected_vector = []

        # Read the predicted labels produced by run.py.
        tsv_result = csv.reader(result, delimiter="\t")
        for line in tsv_result:
            result_vector.append(line)

        # Read the gold labels.
        tsv_expected = csv.reader(expected, delimiter="\t")
        for line in tsv_expected:
            expected_vector.append(line)

        # csv.reader yields one list per row, so flatten both into plain label lists.
        expected_vector = flatten(expected_vector)
        result_vector = flatten(result_vector)

        resultZip = zip(result_vector, expected_vector)

        # Accuracy: fraction of positions where the prediction equals the gold label.
        matchNumber = 0.0
        for x, y in resultZip:
            if x == y:
                matchNumber += 1.0

        print(matchNumber / len(expected_vector))
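For comparison, the same accuracy check can be written without the csv module by reading one label per line; this is only a sketch, assuming the same file layout and paths as eval.py above:

# Sketch only: assumes one label per line in both files, paths as in eval.py.
with open('./result.txt', encoding='utf-8') as f:
    predicted = [line.strip() for line in f]
with open('./train/expected.tsv', encoding='utf-8') as f:
    gold = [line.strip() for line in f]

matches = sum(1 for p, g in zip(predicted, gold) if p == g)
print(matches / len(gold))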
132	run.py	Normal file
@@ -0,0 +1,132 @@
import os
import nltk.data
# nltk.download('punkt')

# Polish keywords and site domains that typically indicate a male author.
male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    "www.youtube.com",
    "www.sfd.pl",
    "www.wykop.pl",
    "www.kfd.pl",
    "www.elektroda.pl",
    "www.autocentrum.pl",
    "www.dobreprogramy.pl",
    "flaker.pl",
    "www.myapple.pl",
    "youtube",
    "sfd",
    "kfd",
    "elektroda",
    "autocentrum",
    "dobreprogramy",
]

# Polish keywords and site domains that typically indicate a female author.
female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    "www.babyboom.pl",
    "tematy.abcciaza.pl",
    "gwiazdunie",
    "photoblog",
    "kotek",
    "babyboom",
    "<3"
]


def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each keyword.
    result = []
    for word in words:
        result.append(word[:6])
    return result


# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen / sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1


def imbalanceWordsClassificator(text):
    # Count stemmed keyword hits for each class and return the majority class:
    # 1 for male, 0 for female; ties default to 1.
    splitSentence = text.split()
    countMale = 0
    countFemale = 0
    for word in splitSentence:
        if word.lower()[:6] in male_words1:
            countMale += 1
        elif word.lower()[:6] in female_words1:
            countFemale += 1
    if countMale >= countFemale:
        return 1
    else:
        return 0
    # else: return sentenceLenClassificator(text)


# Stem the keyword lists once, so lookups compare stem against stem.
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

result = []

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'

with open(input_path, 'r', encoding='utf-8') as trainFile:
    Lines = trainFile.readlines()

# Classify every input line.
for line in Lines:
    result.append(imbalanceWordsClassificator(line))
    # result.append(sentenceLenClassificator(line))

# Write one predicted label per line.
with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in result))

# os.system("eval.py")
print("-------end-------")
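As a quick sanity check, the classifier can be called on a single string once the keyword lists have been stemmed; this is only a sketch, using a made-up Polish sentence that is not taken from the data set:

# Sketch only: hypothetical input, not from the corpus.
sample = "mam problem z systemem windows na moim pc"
print(imbalanceWordsClassificator(sample))  # prints 1: 'system', 'window' and 'pc' hit the male list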
134618	test-A/out.tsv	Normal file
File diff suppressed because it is too large