diff --git a/main.py b/main.py deleted file mode 100755 index bb966bc..0000000 --- a/main.py +++ /dev/null @@ -1,35 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark.sql.functions import concat, lit, col -from pyspark.sql.types import StructType,StructField, StringType, IntegerType - - - -if __name__ == '__main__': - spark = SparkSession\ - .builder\ - .appName('He Said She Said')\ - .getOrCreate() - - X_schema = StructType([StructField('X', StringType(), True)]) - - X_train = spark.read.csv('train/in.tsv', schema=X_schema) - Y_train = spark.read.csv('train/expected.tsv', schema=StructType([StructField('Y', IntegerType(), True)])) - - X_train.show() - - train = X_train.join(Y_train, how='full_outer') - train.show() - - # # $example on$ - # # Load training data - # data = spark.read.format("libsvm") \ - # .load("sample_libsvm_data.txt") - - # print('data = ', data) - - # # Split the data into train and test - # splits = data.randomSplit([0.6, 0.4], 1234) - # train = splits[0] - # test = splits[1] - - # print('train = ', train) \ No newline at end of file diff --git a/run.py b/run.py new file mode 100755 index 0000000..e69de29