3
This commit is contained in:
parent
a2064b7ed9
commit
2be2a96fe1
19454
dev-0/out.tsv
19454
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
23
run.py
23
run.py
@ -12,7 +12,6 @@ data = pd.read_csv(
|
||||
error_bad_lines=False,
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
nrows=250000
|
||||
)
|
||||
train_labels = pd.read_csv(
|
||||
"train/expected.tsv",
|
||||
@ -20,7 +19,6 @@ train_labels = pd.read_csv(
|
||||
error_bad_lines=False,
|
||||
header=None,
|
||||
quoting=csv.QUOTE_NONE,
|
||||
nrows=250000
|
||||
)
|
||||
|
||||
train_data = data[[6, 7]]
|
||||
@ -33,22 +31,17 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
|
||||
|
||||
def clean(text):
|
||||
text = str(text)
|
||||
# normalize text
|
||||
text = (
|
||||
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
|
||||
'utf-8', 'ignore'))
|
||||
# replace html chars with ' '
|
||||
text = re.sub('<.*?>', ' ', text)
|
||||
# remove punctuation
|
||||
text = text.translate(str.maketrans(' ', ' ', string.punctuation))
|
||||
# only alphabets and numerics
|
||||
text = re.sub('[^a-zA-Z]', ' ', text)
|
||||
# replace newline with space
|
||||
unicodedata.normalize("NFKD", text)
|
||||
.encode("ascii", "ignore")
|
||||
.decode("utf-8", "ignore")
|
||||
)
|
||||
text = re.sub("<.*?>", " ", text)
|
||||
text = text.translate(str.maketrans(" ", " ", string.punctuation))
|
||||
text = re.sub("[^a-zA-Z]", " ", text)
|
||||
text = re.sub("\n", " ", text)
|
||||
# lower case
|
||||
text = text.lower()
|
||||
# split and join the words
|
||||
text = ' '.join(text.split())
|
||||
text = " ".join(text.split())
|
||||
return text
|
||||
|
||||
|
||||
|
13608
test-A/out.tsv
13608
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user