diff --git a/dev-0/out.tsv b/dev-0/out.tsv index 92eb1fa..d328e94 100644 --- a/dev-0/out.tsv +++ b/dev-0/out.tsv @@ -2399,7 +2399,7 @@ 1882.9509376976243 2006.2831425999086 1901.8236927643368 -1845.8431605627075 +1846.2982848835586 1907.4807609148313 1919.1770226218323 1808.6714849322975 @@ -4770,7 +4770,7 @@ 1869.227930644483 1906.3844663610223 1945.3424395378358 -1897.9459278201575 +1897.8191028561196 1902.4248382081591 1916.941205362806 1951.3650770196946 @@ -6336,7 +6336,8 @@ 1974.3692055493452 1851.1544441272165 1837.2440449549101 -1921.004623698161 +1906.469703262039 +1941.217890583569 1863.050896361201 1935.4381446070022 1838.6591222095797 @@ -11780,7 +11781,7 @@ 1971.261305356721 1965.7895878732024 1894.1926433945762 -1843.5769117112493 +1843.4054815753093 1798.5510049748643 1874.6762363908592 1933.9946144817523 @@ -11868,7 +11869,7 @@ 1933.1659502110472 1948.6097367533557 1892.0990307008867 -1881.7539718505154 +1882.4301864075958 1898.8163564591582 1891.7720648916354 1899.1896641539054 @@ -13607,7 +13608,8 @@ 1899.4677481479275 1832.0157188177106 1830.994476738317 -1859.8255318357274 +1872.857763141983 +1887.1141759579975 1821.8836244133079 1913.4663037228643 1985.0371631137966 @@ -14150,7 +14152,7 @@ 1958.9030832469919 1917.5848215968706 1905.0147395763986 -1871.6714142048124 +1871.3321978094336 1901.0391754660595 1871.966900926708 1965.8106578756367 @@ -17368,7 +17370,7 @@ 1863.7139407495256 1980.5486905106713 1820.0882670928129 -1856.3160414564259 +1855.6257391640597 1921.6397748707468 1970.804687384616 1902.8181073379312 @@ -19795,7 +19797,7 @@ 1917.3208523541055 1850.9065744029447 1912.2333094584328 -1837.7607134713 +1837.590902324513 1969.993000236959 1956.0008619219907 1941.4140218594555 diff --git a/main.py b/main.py index b0f27c1..b0e1dad 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import csv import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline @@ -6,8 +7,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer col_names = ['start_date', 'end_date', 'title', 'source', 'content'] train_set = pd.read_table('train/train.tsv.xz', error_bad_lines=False, header=None, names=col_names) -dev_set = pd.read_table('dev-0/in.tsv', error_bad_lines=False, header=None, names=col_names[4:]) -test_set = pd.read_table('test-A/in.tsv', error_bad_lines=False, header=None, names=col_names[4:]) +dev_set = pd.read_table('dev-0/in.tsv', error_bad_lines=False, header=None, names=col_names[4:], quoting=csv.QUOTE_NONE,) +test_set = pd.read_table('test-A/in.tsv', error_bad_lines=False, header=None, names=col_names[4:], quoting=csv.QUOTE_NONE,) train_set = train_set.head(10000) @@ -17,6 +18,9 @@ y_train = (train_set['start_date'] + train_set['end_date']) / 2 X_dev = dev_set['content'] X_test = test_set['content'] +print(dev_set) +print(test_set) + print('Trenowanie modelu...') model = make_pipeline(TfidfVectorizer(), LinearRegression()) model.fit(X_train, y_train) diff --git a/test-A/out.tsv b/test-A/out.tsv index dade4f0..1c618ad 100644 --- a/test-A/out.tsv +++ b/test-A/out.tsv @@ -206,7 +206,7 @@ 1928.6587695672113 1990.9156424576067 1881.4836785253315 -1884.4519979671402 +1884.046089520049 1877.4040287725156 1911.8947608148899 1899.7628344836432 @@ -961,7 +961,7 @@ 1920.806099260425 1934.7567428867874 2000.6897694955856 -1914.538168494774 +1915.0424831770722 1961.5348637593509 1976.0306055365545 1990.438972379044 @@ -1458,7 +1458,7 @@ 1897.0381253809762 1898.2169614742145 1907.1817765399285 -1899.7781849186872 +1900.5051189207925 1916.2254375826196 1887.7589605584237 1900.9036082026578 @@ -1944,7 +1944,7 @@ 1932.4369032498905 1924.7608021483702 1932.6665753305242 -1910.2637014903585 +1911.1136741658142 1844.3306216770322 1920.9735284228261 1898.4191575441814 @@ -3448,7 +3448,8 @@ 1985.828576031535 1929.0980372984723 1963.416770426545 -1983.403262556507 +1958.0061961875651 +1979.5713424319551 1927.054394299714 1887.5844594202247 1952.952473384423 @@ -5241,7 +5242,7 @@ 1961.7395028010487 1911.578177275467 1993.361626034721 -1860.938964267848 +1861.582235755756 1880.8873010116863 1891.0584427244678 1927.9286642737504 @@ -6326,7 +6327,7 @@ 1894.4410835153267 1961.8656695163497 1918.6540411919113 -1902.5162709462134 +1901.772008274914 1970.120121368801 1958.8069602196501 1822.1082097880283 @@ -8450,7 +8451,7 @@ 1937.635613281464 1931.6145334405564 1911.15006803172 -1937.8085829935567 +1937.807751149183 1911.5828102456112 1923.676307918832 1934.4279819781618 @@ -10178,7 +10179,7 @@ 1923.6971313843694 1914.7503506572486 1958.747533690674 -1917.7393715934418 +1918.0059649038217 1909.3267326347955 1982.6082963993645 1963.8563866041861 @@ -11287,7 +11288,7 @@ 1930.5405282461093 1855.5220302615335 1952.8298961854482 -1893.2863705412592 +1892.6402597321664 1928.501989244598 1967.1929378220514 1911.8069275316302 @@ -13417,7 +13418,7 @@ 2002.918285090558 1907.5419986831682 1982.5755707544743 -1844.5958529536363 +1844.823717363196 1940.7293100159388 1939.6370593406318 1910.3139645164986