result and scripts

parent 85a53fe007
commit 28e592072c

eval.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# Compare the predictions in ./result.txt against ./train/expected.tsv
# and print the fraction of lines that match (simple accuracy).
import csv


def flatten(t):
    return [item for sublist in t for item in sublist]


with open('./result.txt', mode='r', encoding='utf-8') as result:
    with open('./train/expected.tsv', 'r', encoding='utf-8') as expected:

        result_vector = []
        expected_vector = []

        tsv_result = csv.reader(result, delimiter="\t")
        for line in tsv_result:
            result_vector.append(line)

        tsv_expected = csv.reader(expected, delimiter="\t")
        for line in tsv_expected:
            expected_vector.append(line)

        expected_vector = flatten(expected_vector)
        result_vector = flatten(result_vector)

        resultZip = zip(result_vector, expected_vector)

        matchNumber = 0.0
        for x, y in resultZip:
            if x == y:
                matchNumber += 1.0

        print(matchNumber / len(expected_vector))
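For intuition, a minimal sketch of the comparison eval.py performs, on made-up label vectors (csv.reader yields strings, so labels are compared as strings):

    result_vector = ['1', '0', '1', '1']
    expected_vector = ['1', '0', '0', '1']
    # three of the four positions agree, so the printed score is 3/4
    matches = sum(1.0 for x, y in zip(result_vector, expected_vector) if x == y)
    print(matches / len(expected_vector))  # 0.75

Note that zip stops at the shorter of the two vectors, so a result.txt with fewer lines than expected.tsv quietly lowers the reported score.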
run.py (new file, 132 lines)
@@ -0,0 +1,132 @@
# Label each input line as 1 or 0 by counting keyword matches against two
# hand-picked word lists, compared on their first six characters.
# from multiprocessing.reduction import steal_handle  # unused and Windows-only; disabled
import os
import numpy as np
import nltk.data
# nltk.download('punkt')

male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    "www.youtube.com",
    "www.sfd.pl",
    "www.wykop.pl",
    "www.kfd.pl",
    "www.elektroda.pl",
    "www.autocentrum.pl",
    "www.dobreprogramy.pl",
    "flaker.pl",
    "www.myapple.pl",
    "youtube",
    "sfd",
    "kfd",
    "elektroda",
    "autocentrum",
    "dobreprogramy",
]


female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    "www.babyboom.pl",
    "tematy.abcciaza.pl",
    "gwiazdunie",
    "photoblog",
    "kotek",
    "babyboom",
    "<3"
]


def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each word.
    result = []
    for word in words:
        result.append(word[:6])
    return result
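# Illustration (hypothetical call, not part of the committed script) of what
# the six-character truncation does, using words from the lists above:
#     wordsStem(['antykoncepcyjne', 'windows', 'hd'])
#     # -> ['antyko', 'window', 'hd']
# Input words are later compared as word.lower()[:6], so inflected forms
# such as 'antykoncepcyjnych' still hit the 'antyko' stem.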

#def sentenceLenClassificator(text):
#    threshold = 6
#    sentences = tokenizer.tokenize(text)
#    avgSentLen = 0.0
#    for sentence in sentences:
#        avgSentLen += len(sentence)
#    sentences_len = len(sentences)
#    if sentences_len == 0: return 1
#    avgSentLen = avgSentLen/sentences_len
#    if avgSentLen > threshold: return 0
#    else: return 1


def imbalanceWordsClassificator(text):
    # Count how many words start with a male-list stem versus a female-list
    # stem and return 1 or 0 accordingly (ties go to 1).
    splitSentence = text.split()
    countMale = 0
    countFemale = 0
    for word in splitSentence:
        # if word.lower() in male_words1: result + biasVal
        # elif word.lower() in female_words1: result - biasVal
        if word.lower()[:6] in male_words1: countMale += 1
        elif word.lower()[:6] in female_words1: countFemale += 1
    # normalize result
    # if result < 0.5: return 0
    # else: return 1
    if countMale >= countFemale: return 1
    elif countMale < countFemale: return 0
    # else: return sentenceLenClassificator(text)
    # len classificator
    # return sentenceLenClassificator(text)
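# Worked example of the counting logic (hypothetical sentences; assumes
# male_words1/female_words1 have already been replaced by their stems,
# which happens just below):
#     imbalanceWordsClassificator("kupiłem nowy iphone z systemem windows")
#     # 'iphone', 'system', 'window' match male stems -> 3 vs 0 -> returns 1
#     imbalanceWordsClassificator("jestem w ciąży i biorę tabletki antykoncepcyjne")
#     # 'ciąży', 'tablet', 'antyko' match female stems -> 0 vs 3 -> returns 0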

male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

result = []

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'

with open(input_path, 'r', encoding="utf8") as trainFile:
    Lines = trainFile.readlines()

# Classify each input line and collect one label per line.
for line in Lines:
    result.append(imbalanceWordsClassificator(line))
    # result.append(sentenceLenClassificator(line))

with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in result))

# os.system("eval.py")
print("-------end-------")
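A hedged usage sketch: with input_path and output_path switched back to the commented-out train paths ('./train/in.tsv' and './result.txt'), run.py produces the file that eval.py scores against ./train/expected.tsv. Assuming a 'python' interpreter is on PATH, the two steps can be chained like this:

    import subprocess
    subprocess.run(["python", "run.py"], check=True)   # writes ./result.txt (train paths active)
    subprocess.run(["python", "eval.py"], check=True)  # prints the fraction of matching labels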
test-A/out.tsv (new file, 134618 lines)
File diff suppressed because it is too large.