petite-difference-challenge2/run.py

import os
# nltk is only used by the (currently disabled) sentence-length classifier below.
import nltk.data
# nltk.download('punkt')
# Keywords and domains used as male indicators (Polish tech, automotive,
# and sports vocabulary).
male_words1 = [
    'silnik',
    'windows',
    'gb',
    'mb',
    'meczu',
    'pc',
    'opony',
    'apple',
    'iphone',
    'zwiastuny',
    'hd',
    'ubuntu',
    'systemu',
    'serwer',
    'www.youtube.com',
    'www.sfd.pl',
    'www.wykop.pl',
    'www.kfd.pl',
    'www.elektroda.pl',
    'www.autocentrum.pl',
    'www.dobreprogramy.pl',
    'flaker.pl',
    'www.myapple.pl',
    'youtube',
    'sfd',
    'kfd',
    'elektroda',
    'autocentrum',
    'dobreprogramy',
]
# Keywords and domains used as female indicators (Polish pregnancy, health,
# and parenting vocabulary).
female_words1 = [
    'ciąży',
    'miesiączki',
    'ciasto',
    'ciążę',
    'zadowolona',
    'ciąża',
    'ciazy',
    'antykoncepcyjne',
    'ginekologa',
    'tabletki',
    'porodzie',
    'mąż',
    'miesiączkę',
    'krwawienie',
    'ciasta',
    'gwiazdunie.pl',
    'www.photoblog.pl',
    'szafa.pl',
    'www.kotek.pl',
    'parenting.pl',
    'www.forum-turystyczne.pl',
    'www.babyboom.pl',
    'tematy.abcciaza.pl',
    'gwiazdunie',
    'photoblog',
    'kotek',
    'babyboom',
    '<3',
]
def wordsStem(words):
    # Crude "stemming": keep only the first six characters of each word,
    # which collapses Polish inflected forms (e.g. miesiączki/miesiączkę
    # both become 'miesią').
    return [word[:6] for word in words]
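# Illustration: wordsStem(['antykoncepcyjne', 'pc']) returns ['antyko', 'pc'];
# words shorter than six characters pass through unchanged.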
# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen / sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1
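# To re-enable the classifier above, a tokenizer has to be loaded first.
# A sketch, assuming the punkt data is installed; the original loaded the
# English pickle, but punkt also ships a Polish model:
# tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')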
def imbalanceWordsClassificator(text):
    # Count male- and female-list stems in the line. The keyword lists are
    # stemmed below, so comparing against word.lower()[:6] is consistent.
    countMale = 0
    countFemale = 0
    for word in text.split():
        stem = word.lower()[:6]
        if stem in male_words1: countMale += 1
        elif stem in female_words1: countFemale += 1
    # An earlier variant added/subtracted a biasVal per hit and thresholded
    # the normalized score at 0.5; a sentence-length fallback was also tried:
    # return sentenceLenClassificator(text)
    # Ties (including lines with no keyword hits) default to class 1.
    if countMale >= countFemale: return 1
    return 0
# Stem both keyword lists once so membership tests match word.lower()[:6].
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
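# Hypothetical sanity check (the sentence is illustrative, not from the
# data set):
# imbalanceWordsClassificator('mój mąż kupił nowe opony')
# 'mąż' hits the female list and 'opony' the male list, so the counts tie
# and the classifier returns 1.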
result = []
# Paths for a local run against the training data:
# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'
with open(input_path, 'r', encoding='utf8') as trainFile:
    for line in trainFile:
        result.append(imbalanceWordsClassificator(line))
        # result.append(sentenceLenClassificator(line))

with open(output_path, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(label) for label in result))
# os.system("eval.py")
print("-------end-------")
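# Optional self-check -- a minimal sketch, not the challenge's eval.py.
# It assumes a gold-label file with one 0/1 label per line; the path below
# is a placeholder, adjust it to wherever your expected labels live.
# def accuracy(pred_path, gold_path):
#     with open(pred_path, encoding='utf-8') as p, open(gold_path, encoding='utf-8') as g:
#         pairs = [(a.strip(), b.strip()) for a, b in zip(p, g)]
#     return sum(a == b for a, b in pairs) / len(pairs)
# print(accuracy(output_path, './dev-0/expected.tsv'))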