# paranormal-skeptic-tfidf/main.py

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import csv


def generate_output(predicted_proba, path):
    """Write one clamped probability per line to ``path``.

    Every value of ``predicted_proba`` is clamped to the range
    [0.05, 0.95]; then element ``[1]`` of each row is written on its own
    line, in row order.

    Args:
        predicted_proba: NumPy array of per-row probabilities (the script
            passes the result of ``predict_proba``). It is not modified.
        path: Destination file path; an existing file is overwritten.
    """
    # Clamp into a new array so the caller's data is left untouched.
    clipped = predicted_proba.clip(0.05, 0.95)

    # The context manager guarantees the file is flushed and closed, even if
    # formatting or writing raises.
    with open(path, "w") as out_file:
        out_file.write("".join(f"{row[1]}\n" for row in clipped))


# --- Training data ---------------------------------------------------------
# The input file is read as tab-separated, like every other file in this
# script. QUOTE_NONE makes quote characters inside the text ordinary
# characters, so a stray '"' cannot merge several physical lines into one row.
training_data = pd.read_csv(
    'train/in.tsv.xz',
    header=None,
    sep='\t',
    names=["text", "date"],
    quoting=csv.QUOTE_NONE,
)
y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')

# Bag-of-words counts; astype('U') turns missing texts (NaN) into strings so
# the vectorizer does not reject them.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))

# Transform the count matrix to a normalized tf-idf representation.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

knn = KNeighborsClassifier(n_neighbors=15)

# Train the classifier; y_train holds the label of each training row.
clf = knn.fit(X_train_tfidf, y_train)

# --- Prediction ------------------------------------------------------------
# dev-0 and test-A go through exactly the same steps, so they share one loop.
for split in ("dev-0", "test-A"):
    # on_bad_lines='warn' replaces error_bad_lines=False, which was removed in
    # pandas 2.0: malformed lines are skipped with a warning (needs pandas >= 1.3).
    # NOTE(review): a skipped line makes the output shorter than the input.
    split_data = pd.read_csv(
        f"{split}/in.tsv.xz",
        header=None,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        on_bad_lines='warn',
    )

    # Build the feature vectors with the already-fitted vectorizer and
    # transformer: transform, not fit_transform. Missing texts are converted
    # to strings the same way as for the training data.
    X_new_counts = count_vect.transform(split_data[0].values.astype('U'))
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    print(f"Generating {split} output...")
    predicted_proba = knn.predict_proba(X_new_tfidf)
    generate_output(predicted_proba, f"{split}/out.tsv")
    print(f"Generated {split} output!")
Initial 2020-06-07 12:52:17 +02:00			`from sklearn.feature_extraction.text import CountVectorizer`
			`from sklearn.feature_extraction.text import TfidfTransformer`
			`from sklearn.neighbors import KNeighborsClassifier`
			`import pandas as pd`
			`import csv`


			`def generate_output(predicted_proba, path):`
			`f3 = open(path, "w")`

			`predicted_proba[predicted_proba < 0.05] = 0.05`
			`predicted_proba[predicted_proba > 0.95] = 0.95`

			`string = ""`
			`for probability in predicted_proba:`
			`string += f"{probability[1]}\n"`
			`f3.write(string)`


			`training_data = pd.read_csv('train/in.tsv.xz', header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)`
			`y_train = pd.read_csv('train/expected.tsv', header=None, sep=' ')`

			`count_vect = CountVectorizer()`
			`X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))`

			`# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)`
			`tfidf_transformer = TfidfTransformer()`
			`X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)`

			`knn = KNeighborsClassifier(n_neighbors=15)`

			`# training our classifier ; y_train will have numbers assigned for each category in train data`
			`clf = knn.fit(X_train_tfidf, y_train)`

			`# Input Data to predict their classes of the given y_train`
			`dev_data = pd.read_csv("dev-0/in.tsv.xz", header=None, sep='\t', error_bad_lines=False)`
			`# building up feature vector of our input`
			`X_new_counts = count_vect.transform(dev_data[0])`
			`# We call transform instead of fit_transform because it's already been fit`
			`X_new_tfidf = tfidf_transformer.transform(X_new_counts)`

			`print("Generating dev-0 output...")`
			`predicted_proba_dev = knn.predict_proba(X_new_tfidf)`
			`generate_output(predicted_proba_dev, "dev-0/out.tsv")`
			`print("Generated dev-0 output!")`

			`# Input Data to predict their classes of the given y_train`
			`test_data = pd.read_csv("test-A/in.tsv.xz", header=None, sep='\t', error_bad_lines=False)`

			`# building up feature vector of our input`
			`X_new_counts = count_vect.transform(test_data[0])`
			`# We call transform instead of fit_transform because it's already been fit`
			`X_new_tfidf = tfidf_transformer.transform(X_new_counts)`

			`print("Generating test-A output...")`
			`predicted_proba_test = knn.predict_proba(X_new_tfidf)`
			`generate_output(predicted_proba_test, "test-A/out.tsv")`
			`print("Generated test-A output!")`