ver 2 10000 train

This commit is contained in:
Łukasz Jędyk 2021-05-12 21:22:41 +02:00
parent a2d5d926d0
commit 9dfe8fe3b5
3 changed files with 29 additions and 22 deletions

View File

@ -2399,7 +2399,7 @@
1882.9509376976243
2006.2831425999086
1901.8236927643368
1845.8431605627075
1846.2982848835586
1907.4807609148313
1919.1770226218323
1808.6714849322975
@ -4770,7 +4770,7 @@
1869.227930644483
1906.3844663610223
1945.3424395378358
1897.9459278201575
1897.8191028561196
1902.4248382081591
1916.941205362806
1951.3650770196946
@ -6336,7 +6336,8 @@
1974.3692055493452
1851.1544441272165
1837.2440449549101
1921.004623698161
1906.469703262039
1941.217890583569
1863.050896361201
1935.4381446070022
1838.6591222095797
@ -11780,7 +11781,7 @@
1971.261305356721
1965.7895878732024
1894.1926433945762
1843.5769117112493
1843.4054815753093
1798.5510049748643
1874.6762363908592
1933.9946144817523
@ -11868,7 +11869,7 @@
1933.1659502110472
1948.6097367533557
1892.0990307008867
1881.7539718505154
1882.4301864075958
1898.8163564591582
1891.7720648916354
1899.1896641539054
@ -13607,7 +13608,8 @@
1899.4677481479275
1832.0157188177106
1830.994476738317
1859.8255318357274
1872.857763141983
1887.1141759579975
1821.8836244133079
1913.4663037228643
1985.0371631137966
@ -14150,7 +14152,7 @@
1958.9030832469919
1917.5848215968706
1905.0147395763986
1871.6714142048124
1871.3321978094336
1901.0391754660595
1871.966900926708
1965.8106578756367
@ -17368,7 +17370,7 @@
1863.7139407495256
1980.5486905106713
1820.0882670928129
1856.3160414564259
1855.6257391640597
1921.6397748707468
1970.804687384616
1902.8181073379312
@ -19795,7 +19797,7 @@
1917.3208523541055
1850.9065744029447
1912.2333094584328
1837.7607134713
1837.590902324513
1969.993000236959
1956.0008619219907
1941.4140218594555

1 1840.191355498014
2399 1882.9509376976243
2400 2006.2831425999086
2401 1901.8236927643368
2402 1845.8431605627075 1846.2982848835586
2403 1907.4807609148313
2404 1919.1770226218323
2405 1808.6714849322975
4770 1869.227930644483
4771 1906.3844663610223
4772 1945.3424395378358
4773 1897.9459278201575 1897.8191028561196
4774 1902.4248382081591
4775 1916.941205362806
4776 1951.3650770196946
6336 1974.3692055493452
6337 1851.1544441272165
6338 1837.2440449549101
6339 1921.004623698161 1906.469703262039
6340 1941.217890583569
6341 1863.050896361201
6342 1935.4381446070022
6343 1838.6591222095797
11781 1971.261305356721
11782 1965.7895878732024
11783 1894.1926433945762
11784 1843.5769117112493 1843.4054815753093
11785 1798.5510049748643
11786 1874.6762363908592
11787 1933.9946144817523
11869 1933.1659502110472
11870 1948.6097367533557
11871 1892.0990307008867
11872 1881.7539718505154 1882.4301864075958
11873 1898.8163564591582
11874 1891.7720648916354
11875 1899.1896641539054
13608 1899.4677481479275
13609 1832.0157188177106
13610 1830.994476738317
13611 1859.8255318357274 1872.857763141983
13612 1887.1141759579975
13613 1821.8836244133079
13614 1913.4663037228643
13615 1985.0371631137966
14152 1958.9030832469919
14153 1917.5848215968706
14154 1905.0147395763986
14155 1871.6714142048124 1871.3321978094336
14156 1901.0391754660595
14157 1871.966900926708
14158 1965.8106578756367
17370 1863.7139407495256
17371 1980.5486905106713
17372 1820.0882670928129
17373 1856.3160414564259 1855.6257391640597
17374 1921.6397748707468
17375 1970.804687384616
17376 1902.8181073379312
19797 1917.3208523541055
19798 1850.9065744029447
19799 1912.2333094584328
19800 1837.7607134713 1837.590902324513
19801 1969.993000236959
19802 1956.0008619219907
19803 1941.4140218594555

View File

@ -1,3 +1,4 @@
import csv
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
@ -6,8 +7,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
col_names = ['start_date', 'end_date', 'title', 'source', 'content']
train_set = pd.read_table('train/train.tsv.xz', error_bad_lines=False, header=None, names=col_names)
dev_set = pd.read_table('dev-0/in.tsv', error_bad_lines=False, header=None, names=col_names[4:])
test_set = pd.read_table('test-A/in.tsv', error_bad_lines=False, header=None, names=col_names[4:])
dev_set = pd.read_table('dev-0/in.tsv', error_bad_lines=False, header=None, names=col_names[4:], quoting=csv.QUOTE_NONE,)
test_set = pd.read_table('test-A/in.tsv', error_bad_lines=False, header=None, names=col_names[4:], quoting=csv.QUOTE_NONE,)
train_set = train_set.head(10000)
@ -17,6 +18,9 @@ y_train = (train_set['start_date'] + train_set['end_date']) / 2
X_dev = dev_set['content']
X_test = test_set['content']
print(dev_set)
print(test_set)
print('Trenowanie modelu...')
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X_train, y_train)

View File

@ -206,7 +206,7 @@
1928.6587695672113
1990.9156424576067
1881.4836785253315
1884.4519979671402
1884.046089520049
1877.4040287725156
1911.8947608148899
1899.7628344836432
@ -961,7 +961,7 @@
1920.806099260425
1934.7567428867874
2000.6897694955856
1914.538168494774
1915.0424831770722
1961.5348637593509
1976.0306055365545
1990.438972379044
@ -1458,7 +1458,7 @@
1897.0381253809762
1898.2169614742145
1907.1817765399285
1899.7781849186872
1900.5051189207925
1916.2254375826196
1887.7589605584237
1900.9036082026578
@ -1944,7 +1944,7 @@
1932.4369032498905
1924.7608021483702
1932.6665753305242
1910.2637014903585
1911.1136741658142
1844.3306216770322
1920.9735284228261
1898.4191575441814
@ -3448,7 +3448,8 @@
1985.828576031535
1929.0980372984723
1963.416770426545
1983.403262556507
1958.0061961875651
1979.5713424319551
1927.054394299714
1887.5844594202247
1952.952473384423
@ -5241,7 +5242,7 @@
1961.7395028010487
1911.578177275467
1993.361626034721
1860.938964267848
1861.582235755756
1880.8873010116863
1891.0584427244678
1927.9286642737504
@ -6326,7 +6327,7 @@
1894.4410835153267
1961.8656695163497
1918.6540411919113
1902.5162709462134
1901.772008274914
1970.120121368801
1958.8069602196501
1822.1082097880283
@ -8450,7 +8451,7 @@
1937.635613281464
1931.6145334405564
1911.15006803172
1937.8085829935567
1937.807751149183
1911.5828102456112
1923.676307918832
1934.4279819781618
@ -10178,7 +10179,7 @@
1923.6971313843694
1914.7503506572486
1958.747533690674
1917.7393715934418
1918.0059649038217
1909.3267326347955
1982.6082963993645
1963.8563866041861
@ -11287,7 +11288,7 @@
1930.5405282461093
1855.5220302615335
1952.8298961854482
1893.2863705412592
1892.6402597321664
1928.501989244598
1967.1929378220514
1911.8069275316302
@ -13417,7 +13418,7 @@
2002.918285090558
1907.5419986831682
1982.5755707544743
1844.5958529536363
1844.823717363196
1940.7293100159388
1939.6370593406318
1910.3139645164986

1 1975.9273818887748
206 1928.6587695672113
207 1990.9156424576067
208 1881.4836785253315
209 1884.4519979671402 1884.046089520049
210 1877.4040287725156
211 1911.8947608148899
212 1899.7628344836432
961 1920.806099260425
962 1934.7567428867874
963 2000.6897694955856
964 1914.538168494774 1915.0424831770722
965 1961.5348637593509
966 1976.0306055365545
967 1990.438972379044
1458 1897.0381253809762
1459 1898.2169614742145
1460 1907.1817765399285
1461 1899.7781849186872 1900.5051189207925
1462 1916.2254375826196
1463 1887.7589605584237
1464 1900.9036082026578
1944 1932.4369032498905
1945 1924.7608021483702
1946 1932.6665753305242
1947 1910.2637014903585 1911.1136741658142
1948 1844.3306216770322
1949 1920.9735284228261
1950 1898.4191575441814
3448 1985.828576031535
3449 1929.0980372984723
3450 1963.416770426545
3451 1983.403262556507 1958.0061961875651
3452 1979.5713424319551
3453 1927.054394299714
3454 1887.5844594202247
3455 1952.952473384423
5242 1961.7395028010487
5243 1911.578177275467
5244 1993.361626034721
5245 1860.938964267848 1861.582235755756
5246 1880.8873010116863
5247 1891.0584427244678
5248 1927.9286642737504
6327 1894.4410835153267
6328 1961.8656695163497
6329 1918.6540411919113
6330 1902.5162709462134 1901.772008274914
6331 1970.120121368801
6332 1958.8069602196501
6333 1822.1082097880283
8451 1937.635613281464
8452 1931.6145334405564
8453 1911.15006803172
8454 1937.8085829935567 1937.807751149183
8455 1911.5828102456112
8456 1923.676307918832
8457 1934.4279819781618
10179 1923.6971313843694
10180 1914.7503506572486
10181 1958.747533690674
10182 1917.7393715934418 1918.0059649038217
10183 1909.3267326347955
10184 1982.6082963993645
10185 1963.8563866041861
11288 1930.5405282461093
11289 1855.5220302615335
11290 1952.8298961854482
11291 1893.2863705412592 1892.6402597321664
11292 1928.501989244598
11293 1967.1929378220514
11294 1911.8069275316302
13418 2002.918285090558
13419 1907.5419986831682
13420 1982.5755707544743
13421 1844.5958529536363 1844.823717363196
13422 1940.7293100159388
13423 1939.6370593406318
13424 1910.3139645164986