"""Keyword-count gender classifier for Polish text, one document per line.

Every input line is split on whitespace; each token is lower-cased and cut to
its first ``_STEM_LEN`` characters, then looked up in two hand-picked keyword
lists.  The line is labelled ``1`` when it contains at least as many
"male" keywords as "female" ones, and ``0`` otherwise.

Run as a script to read ``input_path`` and write one label per line to
``output_path``.
"""
import os

try:
    # ``steal_handle`` is only defined on Windows and nothing below uses it;
    # guard the import so the script also starts on Linux/macOS.
    from multiprocessing.reduction import steal_handle
except ImportError:
    steal_handle = None

import numpy as np

try:
    # Only the disabled sentence-length classifier (see the commented-out
    # code below) needs nltk, so a missing install must not stop the script.
    import nltk.data
    # nltk.download('punkt')
except ImportError:
    nltk = None

# Number of leading characters kept as a crude "stem" of a word.
_STEM_LEN = 6

male_words1 = [
    'silnik', 'windows', 'gb', 'mb', 'meczu', 'pc', 'opony', 'apple',
    'iphone', 'zwiastuny', 'hd', 'ubuntu', 'systemu', 'serwer',
    "www.youtube.com", "www.sfd.pl", "www.wykop.pl", "www.kfd.pl",
    "www.elektroda.pl", "www.autocentrum.pl", "www.dobreprogramy.pl",
    "flaker.pl", "www.myapple.pl", "youtube", "sfd", "kfd", "elektroda",
    "autocentrum", "dobreprogramy",
]

female_words1 = [
    'ciąży', 'miesiączki', 'ciasto', 'ciążę', 'zadowolona', 'ciąża',
    'ciazy', 'antykoncepcyjne', 'ginekologa', 'tabletki', 'porodzie',
    'mąż', 'miesiączkę', 'krwawienie', 'ciasta', 'gwiazdunie.pl',
    'www.photoblog.pl', 'szafa.pl', 'www.kotek.pl', 'parenting.pl',
    'www.forum-turystyczne.pl', "www.babyboom.pl", "tematy.abcciaza.pl",
    "gwiazdunie", "photoblog", "kotek", "babyboom", "<3",
]


def wordsStem(words, length=_STEM_LEN):
    """Return a list with every word cut to its first ``length`` characters.

    Args:
        words: iterable of strings.
        length: number of leading characters to keep (default 6).

    Returns:
        A new list, in the same order as ``words``; words shorter than
        ``length`` are returned unchanged.
    """
    return [word[:length] for word in words]


# Disabled alternative classifier, kept for reference.  It requires the
# ``tokenizer`` that is commented out further down.
#
# def sentenceLenClassificator(text):
#     threshold = 6
#     sentences = tokenizer.tokenize(text)
#     avgSentLen = 0.0
#     for sentence in sentences:
#         avgSentLen += len(sentence)
#     sentences_len = len(sentences)
#     if sentences_len == 0: return 1
#     avgSentLen = avgSentLen/sentences_len
#     if avgSentLen > threshold: return 0
#     else: return 1


# The keyword lists are replaced by their stemmed form so that they can be
# compared with the stemmed tokens of the input text.
# NOTE: 6-character stems reduce the URL entries to prefixes such as
# 'www.yo' or 'www.ph', so any token sharing that prefix counts as a hit.
male_words1 = wordsStem(male_words1)
female_words1 = wordsStem(female_words1)

# Set views of the stemmed lists: constant-time membership tests instead of
# scanning a list for every token.
_MALE_STEMS = frozenset(male_words1)
_FEMALE_STEMS = frozenset(female_words1)


def imbalanceWordsClassificator(text):
    """Label ``text`` by comparing counts of male and female keywords.

    The text is split on whitespace.  Each token is lower-cased and cut to
    ``_STEM_LEN`` characters before the lookup; punctuation is not stripped.

    Args:
        text: the document to classify.

    Returns:
        ``1`` if the number of male keyword hits is greater than or equal to
        the number of female hits (so ties and texts without any keyword
        yield ``1``), otherwise ``0``.
    """
    countMale = 0
    countFemale = 0
    for word in text.split():
        stem = word.lower()[:_STEM_LEN]
        if stem in _MALE_STEMS:
            countMale += 1
        elif stem in _FEMALE_STEMS:
            countFemale += 1
    return 1 if countMale >= countFemale else 0


# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# input_path = './train/in.tsv'
# output_path = './result.txt'
input_path = './test-A/in.tsv'
output_path = './expected-gonito.tsv'


def main():
    """Classify every line of ``input_path`` and write labels to ``output_path``.

    The output holds one label per input line, separated by newlines, with
    no trailing newline after the last label.
    """
    with open(input_path, 'r', encoding="utf8") as trainFile:
        labels = [imbalanceWordsClassificator(line) for line in trainFile]
        # labels = [sentenceLenClassificator(line) for line in trainFile]

    with open(output_path, mode='wt', encoding='utf-8') as myfile:
        myfile.write('\n'.join(str(label) for label in labels))

    # os.system("eval.py")
    print("-------end-------")


if __name__ == "__main__":
    main()