From b0557d534e22cdc3d3b094e36dea3f2ed7fb4f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krystian=20Osi=C5=84ski?= Date: Tue, 23 Apr 2024 20:45:56 +0200 Subject: [PATCH 1/2] add lab8 --- lab/lab_08.ipynb | 132 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 17 deletions(-) diff --git a/lab/lab_08.ipynb b/lab/lab_08.ipynb index a98e197..fb15b96 100644 --- a/lab/lab_08.ipynb +++ b/lab/lab_08.ipynb @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "id": "1e80adcf-ac34-4c38-a2c2-5735985c963e", "metadata": {}, "outputs": [ @@ -106,7 +106,7 @@ "0.7476897494228967" ] }, - "execution_count": 1, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -395,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "49c68adb-f242-434a-94e0-8236bb944e1b", "metadata": {}, "outputs": [ @@ -409,26 +409,103 @@ } ], "source": [ - "import PyDictionary \n", - "dictde = PyDictionary.PyDictionary() \n", - " \n", - "translation = dictde.translate(\"happy\",'de') \n", - "print(translation)" + "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "descending-easter", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + 
"Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n", + "Invalid Word\n" + ] + } + ], "source": [ "from PyDictionary import PyDictionary\n", - "\n", - "def translate(word):\n", - " dictionary = PyDictionary()\n", - " translation = dictionary.translate(word, 'de')\n", - " return translation\n", + "#Szanowny Panie, na prawdę starałem się wykonać to zadanie. zauważyłem, że przy pierwszym odpaleniu to działało.\n", + "#teraz jednak nie chce się odpalić. TO samo miałem na prywatnym komputerze, restart również nie pomógł. 
Ponoć powodem\n", + "#jest fakt, że biblioteka ta używa goole translate, a goole z tym walczy\n", "\n", "def analyze_translations():\n", " dictionary = PyDictionary()\n", @@ -437,9 +514,30 @@ " words = []\n", " for word in sentence:\n", " words.append(dictionary.translate(word,'de'))\n", - " result.append(words)\n", - "\n" + " result.append(words)\n", + " return result\n", + "\n", + "x = 0\n", + "y = 0\n", + "#wordSET = analyze_translations() \n", + "for i in len(wordSET) - 1:\n", + " for word in wordSET[i]:\n", + " if word in HUMAN[i][0]:\n", + " x += 1\n", + " if word in MACHINE[i]:\n", + " y += 1\n", + "\n", + "print(x)\n", + "print(y) " ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff84873c-9f1a-4a2b-b31b-b10a49ce65e6", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From b0c9d9a3f2388cbf1d189e6d719da70f94d576b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krystiab=20Osi=C5=84ski?= Date: Thu, 30 May 2024 16:26:42 +0200 Subject: [PATCH 2/2] add lab 9-11 --- lab/lab_09-10.ipynb | 1424 +++++++++++++++++++++++++++++++++++++++++-- lab/lab_11.ipynb | 339 +++++++++- 2 files changed, 1680 insertions(+), 83 deletions(-) diff --git a/lab/lab_09-10.ipynb b/lab/lab_09-10.ipynb index b31e24d..3fafb4e 100644 --- a/lab/lab_09-10.ipynb +++ b/lab/lab_09-10.ipynb @@ -42,7 +42,25 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "13f6d2cc-d03f-4805-8a9a-69bce9c02f67", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/bash: line 1: playwright: command not found\n" + ] + } + ], + "source": [ + "pip install playwright" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "revolutionary-trust", "metadata": {}, "outputs": [ @@ -50,37 +68,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "Nastolatek ukradł znajomemu 4500 złotych. Wcześniej pił z nim alkohol\n", - "Czekają nas kolejne podwyżki rachunków. 
Tym razem za ogrzewanie i ciepłą wodę\n", - "Nie żyje Piotr Ś. Czyściciel kamienic miał 47 lat\n", - "Maciej Skorża nie zmienił zdania o systemie na mecz z Rakowem. Kolejorz ma szybką okazję do rehabilitacji\n", - "Kto zabił Kazimierę Kurkowiak? Poznańskie Archiwum X wraca do sprawy sprzed 30 lat\n", - "Mieszkańcy osiedla Kwiatowego zyskają nowy chodnik\n", - "Poznańskie ZOO ponownie się otwiera i apeluje o kupowanie biletów online\n", - "1700 zł mandatu dla motocyklisty: nie ma prawa jazdy, jechał za szybko\n", - "Plac Wolności ma tętnić życiem. Jest koncepcja zagospodarowania\n", - "Dzikie wysypisko w Wielkopolskim Parku Narodowym, a w nim paczka z telefonem odbiorcy\n", - "Dobre wieści z Łazarza! \"Zielona Perła\" sprzedana!\n", - "Sokoły wędrowne w gnieździe na kominie poznańskiej elektrociepłowni! Są 4 młode\n", - "720 nowych zakażeń w Wielkopolsce\n", - "Uderzył kobietę w sklepie: \"sprawca będzie rozliczony\"\n", - "Zespół Szkół Geodezyjno- Drogowych. Przyszłość rysuje się w kolorowych barwach!\n", - "Tajemniczy wypadek i pożar pod Kwilczem. Auto spłonęło, w środku nikogo nie było\n", - "Nad Jeziorem Maltańskim powstanie duży hotel? \"Ma uzupełniać infrastrukturę sportową\"\n", - "Śmiertelny wypadek na trasie S8: samochód potrącił rowerzystę\n", - "Specjaliści o poszukiwaniu Natalii Lick: \"niestety trop psa prowadził na Wartostradę\"\n", - "Korki przy skrzyżowaniu Grochowska / Grunwaldzka: ruszyły prace!\n", - "Restauracja w Kaliszu przyjmuje klientów: sanepid i policja \"odwiedzili\" lokal\n", - "Ile kosztuje wywóz odpadów?\n", - "Dachowanie auta na trasie Konin - Turek\n", - "Kierowca BMW pod wpływem narkotyków, pasażer w ich posiadaniu. Obaj zostali zatrzymani\n", - "Leszno: mężczyzna uderzył klientkę sklepu. Poszło o maseczkę?\n", - "Od poniedziałku zapłacimy za parkowanie na kolejnych ulicach\n", - "Włamał się do obiektu handlowego. 
Grozi mu nawet 15 lat więzienia\n", - "Rondo Śródka: kolizja z udziałem dwóch pojazdów\n", - "Europoseł PSL: oświadczenie Episkopatu ma wpływ na proces szczepień. \"Bardzo dużo ludzi zrezygnowało\"\n", - "Bezcenna wygrana Enea Energetyka. Poznanianki zagrają w fazie play-off\n", - "No to w drogę! Po odmienionych trasach w Wielkopolsce\n" + "Lech przegrał Koroną. Na trybunach marsz żałobny i 'mamy k**** dość'\n", + "Warta Poznań po przegranej z Jagielonią Białystok spada do I ligi\n", + "Mieszkańcy skarżą się na właściciela samochodu, w którym notorycznie włącza się alarm. \"Uprzykrza nam to życie!\"\n", + "Śmiertelny wypadek w Wielkopolsce. Nie żyje młoda kobieta\n", + "Leśne Placówki Montessori\n", + "Na autostradzie samochód wpadł w poślizg i stanął w poprzek. Są spore utrudnienia\n", + "Wróciła plaga kradzieży katalizatorów. Zmora dla kierowców, którzy nie mogą garażować auta\n", + "Nowy basen w Kiekrzu? W tunelu wody przybyło po same kolana\n", + "Pierożki Dim Sum z Para Bar Rataje ze specjalną zniżką!\n", + "Fałszywy pożar w centrum Poznania. Kłęby dymu w kamienicy?\n", + "Uwaga. Utrudnienia na drodze i ograniczenie prędkości. Potrwa to około 5 godzin\n", + "Chcą pobić rekord w kręceniu lodów. Tona lodów w ciągu doby\n", + "Nowe Centrum Medyczne Bizpark już w sprzedaży. Znajdź idealny lokal pod swoją działalność medyczną\n", + "Rondo Obornickie: zderzenie samochodu z motocyklem. Poszkodowany został odwieziony do szpitala. Chwilowe utrudnienia\n", + "Policjanci publikują wizerunek i szukają tego mężczyzny\n", + "Grupa Stonewall będzie miała program na antenie TVP3 Poznań. \"To będzie odtrutka na lata dezinformacji\"\n", + "Ruszył remont ważnego mostu. Co z kłódkami zakochanych?\n", + "Mieszkaniec spotkał wilka w Poznaniu?\n", + "Włamanie do... lokomotywy\n", + "W nadwarciański krajobraz wpisały się... żurawie. \"Jeden jest największy na świecie\"\n", + "Robisz remont? Za to możesz słono zapłacić!\n", + "Agresywny mężczyzna zaatakował strażaków. 
Miał \"dwa noże oraz gaz łzawiący\"\n", + "Rower dla nastolatka - wyzwanie dla rodzica. MTB, Dirt czy BMX?\n", + "Wypadek z udziałem dziecka w Poznaniu\n", + "Ulewa nie przeszkadza studentom. Zabawa trwa!\n", + "Mąka musi zniknąć ze sklepowych półek. Masz ją w domu?\n", + "Wiatr zrywa dachy w Wielkopolsce. Strażacy odebrali już ponad 140 zgłoszeń\n", + "MPK Poznań testuje kolejny \"wodorowiec\". Wiadomo, na jakich liniach go spotkamy\n", + "Najnowsze trendy edukacyjne - żłobek, przedszkole i szkoła w OGRODZIE\n", + "Uszkodzona sieć trakcyjna. Pociągi notują duże opóźnienia!\n" ] } ], @@ -108,13 +125,84 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "85551a51-99af-439f-9a94-2ff2bcb2d0ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/bash: line 1: playwright: command not found\n" + ] + } + ], + "source": [ + "!playwright install" + ] + }, + { + "cell_type": "code", + "execution_count": 49, "id": "moving-clothing", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['Lodówka Bosch Serie 6 KGN39LBCF z zamrażalnikiem dolnym 203 cm Czarna',\n", + " 'Lodówka Bosch Serie 4 KGN392LCF z zamrażalnikiem dolnym 203 cm Srebrna',\n", + " 'Lodówka MPM 81-CJH-23/E jednodrzwiowa 122 cm Biała',\n", + " 'Lodówka Amica FK244.4X z zamrażalnikiem dolnym 144 cm Srebrna',\n", + " 'Lodówka MPM 182-KB-33/AA z zamrażalnikiem dolnym 142,2 cm Szara',\n", + " 'Lodówka Samsung Bespoke RB38C7B6AB1 z zamrażalnikiem dolnym 203 cm Czarna',\n", + " 'Lodówka Candy CCG1S 518EW z zamrażalnikiem dolnym 179,3 cm Biała',\n", + " 'Lodówka Samsung RB33B612FBN z zamrażalnikiem dolnym 185,3 cm Czarna',\n", + " 'Lodówka Bosch KGN36VI20 z zamrażalnikiem dolnym Srebrna',\n", + " 'Lodówka Beko RCSA300K40WN z zamrażalnikiem dolnym 181,3 cm Biała',\n", + " 'Lodówka Candy CCT3L517FW z zamrażalnikiem dolnym 176 cm Biała',\n", + " 'Lodówka LG GBB62SWGGN z zamrażalnikiem dolnym 203 cm 
Biała',\n", + " 'Lodówka LG GBP31DSLZN DoorCooling+ z zamrażalnikiem dolnym 186 cm Szara',\n", + " 'Lodówka Gorenje RF414EPS4 z zamrażalnikiem górnym 143,6 cm Szara',\n", + " 'Lodówka Vivax DD-207S z zamrażalnikiem górnym 143 cm Szara',\n", + " 'Lodówka MPM 324-KB-35/AA z zamrażalnikiem dolnym 185 cm Srebrna',\n", + " 'Lodówka Bosch Serie 4 KGN362WDF z zamrażalnikiem dolnym 186 cm Biała',\n", + " 'Lodówka Amica FD2015.4 z zamrażalnikiem górnym 122 cm Biała',\n", + " 'Lodówka Gorenje RK4182PS4 z zamrażalnikiem dolnym 180 cm Srebrna',\n", + " 'Lodówka Amica FM126.4(E) jednodrzwiowa 85 cm Biała',\n", + " 'Lodówka Hisense RB390N4BFC z zamrażalnikiem dolnym 186 cm Czarna',\n", + " 'Lodówka Gorenje R619EEW5 jednodrzwiowa 185 cm Biała',\n", + " 'Lodówka Bosch Seria 2 KGN33NLEB z zamrażalnikiem dolnym 176 cm',\n", + " 'Lodówka Beko TS190340N jednodrzwiowa 81,8 cm Biała',\n", + " 'Lodówka Beko RCSA270K40SN z zamrażalnikiem dolnym 170,8 cm Szara',\n", + " 'Lodówka Samsung RB33B612ESA z zamrażalnikiem dolnym 185,3 cm Srebrna',\n", + " 'Lodówka Polar POB601EW z zamrażalnikiem dolnym 159 cm Biała',\n", + " 'Lodówka Gorenje RK4181PW4 z zamrażalnikiem dolnym 180 cm Biała',\n", + " 'Lodówka MPM 108-KB-45 z zamrażalnikiem dolnym 114 cm Czarna',\n", + " 'Lodówka Sigma BC-45 Czarny jednodrzwiowa 85 cm Czarna',\n", + " 'Lodówka MPM 215-KB-38/E z zamrażalnikiem dolnym 150 cm Biała']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", "def get_names(article_type):\n", - " return []" + " url=f'https://www.ceneo.pl/;szukaj-{article_type}'\n", + " page = requests.get(url)\n", + " soup = BeautifulSoup(page.content, 'html.parser')\n", + " \n", + " products = soup.find_all('strong', {'class':'cat-prod-row__name'})\n", + " products = [products.get_text().strip() for products in products]\n", + " #print(''.join([products.get_text() for products in products])) \n", + " 
return products\n", + " \n", + "get_names(\"lodowka\")" ] }, { @@ -135,13 +223,103 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "german-dispute", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[['Drukarka laserowa Brother HL-1222WE',\n", + " 'Drukarka laserowa Brother HL-1110E',\n", + " 'Brother DCP-1512E',\n", + " 'Brother MFC-L8690CDW',\n", + " 'Monitoring Domu 4 Kamery Ip DS-2DE2A204IW-DE3 PoE',\n", + " 'Vasco Translator V4 Stone Gray',\n", + " 'Imou - Zestaw Bezprzewodowego Monitoringu Wifi: 4 Kamery + Nvr Nvr1104Hs-W-S2/4-F22 (NVR1104HSWS24F22)',\n", + " 'Zestaw EasyCam Wi-Fi Kit/NVR4CH/4-3T3IR',\n", + " 'SJCAM A10 IP65 czarny',\n", + " 'Monitoring 8 Kamer 5 Mpx Zewnętrzne Dzień Noc Dysk',\n", + " 'Tapo Tp-Link C420S2 Kamera Do Monitoringu Zewnętrzna Wysoka Rozdzielczość 2K 180 Dni Pełnokolorowa Widoczność W Nocy Inteligentna Identyfikacja',\n", + " 'Samsung The Freestyle 2023 (SP-LFF3CLAXXXH)',\n", + " 'Ubiquiti Unifi Ai Dslr (UVCAIDSLR)',\n", + " 'Bosch UniversalBrush 06033E0000',\n", + " 'Karcher HD 5/15 C 1.520-930.0',\n", + " 'Tp Link Tapo C500',\n", + " 'Karcher SE 3-18 Compact Home 1.081-506.0',\n", + " 'Stihl Odkurzacz Na Sucho I Mokro Se 33',\n", + " 'Stanley Wet&Dry 1600W 30L SXVC30XTDE',\n", + " 'Optoma Uhd35X (E9PV7GL06EZ1)',\n", + " 'Karcher T 7/1 Classic 1.527-181.0',\n", + " 'Philips Neopix 110 Czarny (NPX110INT)',\n", + " 'Karcher NT 22/1 Ap L 1.378-600.0',\n", + " 'Karcher HD 8/23 G Classic 1.187-012.0',\n", + " 'Karcher HD 5/15 C Plus 1.520-931.0',\n", + " 'Karcher WPD 50 Ws 1.024-405.0',\n", + " 'Karcher SE 3-18 Compact 1.081-500.0',\n", + " 'Texas Instruments Graficzny Ti-Nspire Cx',\n", + " 'Karcher Puzzi 10/1 1.100-130.0',\n", + " 'Bosch GAS 12-25 PL Professional 060197C100'],\n", + " ['Casio DATABANK DBC-32D-1ADF',\n", + " 'Casio G-Shock GBD-200 -1ER',\n", + " 'Casio Edifice EFV-550P-1AVUEF',\n", + " 'G. 
GERLACH DYWIZJON 303',\n", + " 'Casio EFV-500D-1AVUEF',\n", + " 'ORIENT Mako Solar RA-TX0203S10B',\n", + " 'Zeppelin Ze-7640-1',\n", + " 'Tommy Hilfiger Watch Daniel 1710383',\n", + " 'Seiko 5 Snkd99K1',\n", + " 'Casio G-Shock GW-3000M-4AER',\n", + " 'Seiko 5 Automatic SNK357K1',\n", + " 'ZEGAREK SEIKO SPIRIT SBTQ045 CHRONO',\n", + " 'Orient Classic Automatic FAC00009N0',\n", + " 'Delbana Retro Chronograph 426016726064',\n", + " 'Casio Sport AE-1200WHD-1A',\n", + " 'Michael Kors Ritz Mk6356',\n", + " 'Aviator AVW6975G354',\n", + " 'Casio Edifice EFV-C110D-1A4VEF',\n", + " 'Casio G-Shock GA-2100 -1A3ER',\n", + " 'Orient Mako III Automatic Raaa0004E19B',\n", + " 'Adriatica Super De Luxe A8331.1251Q',\n", + " 'Casio LTP-1215A',\n", + " 'G. GERLACH KOSMONAUTA 9823UNIW',\n", + " 'Casio VINTAGE A168WEGG-1BEF',\n", + " 'Seiko SSB385P1',\n", + " 'Zeppelin Atlantic 84623',\n", + " 'Casio MTP-M305M-1AVER',\n", + " 'Casio MTP-1302PD -3AVEF',\n", + " 'Casio MTP-1302PD -2A2VEF',\n", + " 'Guess GW0118L1']]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def scrape_names():\n", - " return []" + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "def scrape_names(numer_of_categories):\n", + " \n", + " url=f'https://www.ceneo.pl'\n", + " page = requests.get(url)\n", + " soup = BeautifulSoup(page.content, 'html.parser')\n", + " \n", + " categories = soup.find_all('a', {'class':'cat-menu-item__link'})\n", + " categories = [categories.get_text().strip() for categories in categories]\n", + " products = []\n", + " i = 0\n", + " for category in categories:\n", + " if i >= numer_of_categories:\n", + " break\n", + " products.append(get_names(category))\n", + " i += 1\n", + " \n", + " return products\n", + "\n", + "scrape_names(2)" ] }, { @@ -154,37 +332,29 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 74, "id": "premium-button", "metadata": {}, - "outputs": [ - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - " Yahoo Make Yahoo Your HomepageDiscover something new every day from News, Sports, Finance, Entertainment and more! HOME MAIL NEWS FINANCE SPORTS ENTERTAINMENT LIFE SHOPPING YAHOO PLUS MORE... Download the Yahoo Home app Yahoo Home Search query Sign in Mail Sign in to view your mail Mail Mail COVID-19 COVID-19 News News Finance Finance Sports Sports Entertainment Entertainment Life Life Shopping Shopping Yahoo Plus Yahoo Plus More... More... Follow live:Closing arguments begin for Derek Chauvin's murder trial in the death of George Floyd 5 people in hospital after shooting in Louisiana One victim was shot in the head, and another suffered multiple gunshot wounds, according to local news outlet.Multiple police units dispatched to scene »2 dead in crash of Tesla with 'no one' drivingMall shooter, 16, faces 1st-degree murder charge'80s pop star rips 'Simpsons' for 'hateful' parodyConspiracy theorist Alex Jones faces a reckoningPig's head left at former home of Chauvin trial witness U.S.HuffPostFirst-Ever Wild Wolf Collar Camera Shows What They Really Do All Day LongThis canine's favorite meal might surprise you. Thanks for your feedback! CelebrityThe TelegraphRobert De Niro unable to turn down acting roles because of his ‘estranged wife's expensive lifestyle’Hollywood legend Robert De Niro is unable to turn down acting roles because he must pay for his estranged wife's expensive tastes, the actor's lawyer has claimed. Caroline Krauss told a Manhattan court that he is struggling financially because of the pandemic, a massive tax bill and the demands of Grace Hightower, who filed for divorce in 2018 after 21 years of marriage. The court has been asked to settle how much De Niro should pay Ms Hightower, 66, until the terms of the prenuptial agreement the couple negotiated in 2004 takes effect. 
“Mr De Niro is 77 years old, and while he loves his craft, he should not be forced to work at this prodigious pace because he has to,” Ms Krauss told the court. “When does that stop? When does he get the opportunity to not take every project that comes along and not work six-day weeks, 12-hour days so he can keep pace with Ms Hightower’s thirst for Stella McCartney?” Thanks for your feedback! U.S.Associated PressCouple: Man has tossed used cups in their yard for 3 yearsAn upstate New York couple may have finally solved the mystery of who's been tossing used coffee cups in their front yard for nearly three years. Edward and Cheryl Patton told The Buffalo News they tried mounting a camera in a tree in front of their home in Lake View to catch the phantom litterer. After Edward Patton called police, they waited and pulled over a vehicle driven by 76-year-old Larry Pope, who Cheryl Patton said had once worked with her and had had disagreements with her over union issues. Thanks for your feedback! U.S.INSIDERA leading conspiracy theorist who thought COVID-19 was a hoax died from the virus after hosting illegal house partiesA high-profile conspiracy theorist from Norway, who shared false information about the pandemic online, has died from COVID-19, officials say. Thanks for your feedback! PoliticsThe WeekOne America News Network producer says 'majority' of employees didn't believe reports on voter fraud claimsMarty Golingan, a producer at One America News Network, a right-wing cable news channel often noted for its affinity for former President Donald Trump, told The New York Times he was worried his work may have helped inspire the Jan. 6 Capitol riot. At one point during the incident, Golingan said he caught sight of someone in the mob holding a flag with OAN's logo. \"I was like, OK, that's not good. 
That's what happens when people listen to us,\" he told the Times, referring to OAN's coverage of the 2020 presidential election, which often gave credence to Trump's unfounded claims of widespread voter fraud and Democratic conspiracies. Golingan said that many of his colleagues, including himself, disagreed with the coverage. \"The majority of people did not believe the voter fraud claims being run on the air,\" he told the Times. Indeed, the Times interviewed 18 current and former OAN employees, 16 of whom said the channel has \"broadcast reports that they considered misleading, inaccurate, or untrue.\" But Allysia Britton, a former producer and one of more than a dozen employees to leave OAN in the wake of the riot, explained that while \"many people have raised concerns ... when people speak up about anything, you will get in trouble.\" Read more at The New York Times. More stories from theweek.comThe new HBO show you won't be able to stop watchingDonald Trump's most dangerous political legacyTrump's NSA general counsel Michael Ellis resigns, never having taken office Thanks for your feedback! BusinessMoneyWiseFourth stimulus check update: Biden faces mounting pressure for new paymentAdvocates and lawmakers say the crisis isn't over, and neither is the need for relief. Thanks for your feedback! CelebrityThe TelegraphLand Rover driver at Prince Philip's funeral spent week ensuring he could drive at correct speedHuffPostPrince Philip's Funeral, In PhotosUSA TODAY EntertainmentWhy did Prince Philip's Land Rover carry his casket? The story behind the strange hearse Thanks for your feedback! Trending Now1. Gianna Hammer2. Derek Chauvin3. Black Rob4. 2021 Acm Awards5. Baby Shower Invitations6. Amanda Broderick7. Mortgage Refinance Calculator8. Interest Rates Today9. Tesla Crash10. Mars Helicopter Yahoo! 
Mail WeatherWeatherGreater PolandView your LocationsRemove from favorite locationsDetect my locationEnter City or ZipcodeManage LocationsToday66°45°TueRain today with a high of 59 °F (15.0 °C) and a low of 41 °F (5.0 °C). There is a 50% chance of precipitation.59°41°WedPartly cloudy today with a high of 57 °F (13.9 °C) and a low of 41 °F (5.0 °C).57°41°ThuScattered showers today with a high of 48 °F (8.9 °C) and a low of 37 °F (2.8 °C). There is a 35% chance of precipitation.48°37°See More » ScoreboardChange Sports to display different scoresNBA NFL MLB NHL NCAAB NCAAF Trending YesterdayTodayTomorrowPortland Charlotte 101109FinalSacramento Dallas 121107FinalMinnesota LA Clippers 105124FinalMore scores » HoroscopeChange your horoscope signAriesTaurusGeminiCancerLeoVirgoLibraScorpioSagittariusCapricornAquariusPiscesApril 19 -Aries - You're feeling the heat, and you may find that your friends like it as much as you do! Your great energy is perfect for almost any activity, so light up the night and have a great time! See more » Yahoo! Mail Yahoo! 
Sports Terms (Updated)Privacy (Updated)AdvertiseAbout Our AdsCareersHelpFeedback Close this content, you can also use the Escape key at anytime \n" - ] - } - ], + "outputs": [], "source": [ "import re\n", "\n", - "url = \"https://www.yahoo.com\"\n", + "def get_text(url):\n", "\n", - "page = requests.get(url)\n", - "soup = BeautifulSoup(page.content, 'html.parser')\n", + " page = requests.get(url)\n", + " soup = BeautifulSoup(page.content, 'html.parser')\n", "\n", - "# usunięcie elementów script i style\n", - "for script in soup([\"script\", \"style\"]):\n", - " script.extract() # usuń element\n", + " # usunięcie elementów script i style\n", + " for script in soup([\"script\", \"style\"]):\n", + " script.extract() # usuń element\n", "\n", - "# pobierz tekst\n", - "text = soup.get_text()\n", + " # pobierz tekst\n", + " text = soup.get_text()\n", "\n", - "# usuń wielokrotne białe znaki\n", - "text = re.sub(r\"\\s+\", \" \", text)\n", + " # usuń wielokrotne białe znaki\n", + " text = re.sub(r\"\\s+\", \" \", text)\n", "\n", - "print(text)" + " return(text)" ] }, { @@ -197,13 +367,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "id": "regulation-sheriff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "' RSS WMiI – Rada Samorządu Studentów Wydziału Matematyki i Informatyki Strona główna O nas Kontakt Studenci 1. roku BIP Starości Koła Naukowe WMiI Szukaj: Szukaj: Szukaj: RSS WMiI Rada Samorządu Studentów Wydziału Matematyki i Informatyki Facebook RSS WMiIInstagram RSS WMiI Przejdź do treści Strona główna O nas Kontakt Studenci 1. roku BIP Starości Koła Naukowe WMiI Szukaj: Szukaj: Szukaj: RSS WMiIRada Samorządu Studentów Wydziału Matematyki i Informatyki Previous Slide Next Slide Poznajmy się bliżej Kontakt Sprawdź! \"Kontakt\" O nas Sprawdź! \"O nas\" Studenci 1. roku Sprawdź! \"Studenci 1. 
roku\" Odwiedź również: Samorząd Studentów UAM Wydział Matematyki i Informatyki UAM Szukaj: Szukaj: Szybki kontakt: samorzad@wmi.amu.edu.pl pok. B-2, WMiI UAM, Uniwersytetu Poznańskiego 4, 61-614 Poznań Powrót na górę Facebook RSS WMiIInstagram RSS WMiI©2020 Samorząd WMIOparte na Anima & WordPress. Skip to content Open toolbar Dostępność witryny Zwiększ tekst Zmniejsz tekst Wysoki kontrast Negatywny kontrast Jasne tło Links Underline Czytelna czcionka Reset '" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def scrape_wmi():\n", - " return []" + "import re\n", + "def scrape_wmi(x):\n", + " \n", + "\n", + " url = \"https://wmi.amu.edu.pl\"\n", + "\n", + " page = requests.get(url)\n", + " soup = BeautifulSoup(page.content, 'html.parser')\n", + "\n", + " links = set()\n", + " link = \"\"\n", + " \n", + " for a in soup.find_all('a', href=True):\n", + " link = a['href']\n", + " if link[0] == '#' or 'mailto:' in link or 'tel:' in link:\n", + " continue\n", + " else:\n", + " links.add(link)\n", + " \n", + " websites_text = []\n", + " \n", + " for link in links:\n", + " websites_text.append(get_text(link))\n", + " \n", + " \n", + " return websites_text[x]\n", + "\n", + "scrape_wmi(1)" ] }, { @@ -229,9 +436,1094 @@ "metadata": {}, "outputs": [], "source": [ - "def scrape_shqip():\n", - " return []" + "NIEOPTYMALNA_WERSJA = 0\n", + "# import requests\n", + "# from bs4 import BeautifulSoup\n", + "\n", + "# def scrape_shqip():\n", + "# ranges = [\n", + "# \"0-1000\",\n", + "# # \"1000-2000\",\n", + "# # \"2000-3000\",\n", + "# # \"3000-4000\",\n", + "# # \"4000-5000\",\n", + "# # \"5000-6000\"\n", + "# ]\n", + " \n", + "# sq_words = []\n", + "# url_main = \"https://glosbe.com/topwords/en/sq/\"\n", + "# url_word = \"https://glosbe.com/en/sq/\"\n", + "# for r in ranges:\n", + "# url_main += r\n", + "# page = requests.get(url_main)\n", + "# soup = BeautifulSoup(page.content, 'html.parser')\n", + "# categories = 
soup.find_all('li', {'class':'mb-4'})\n", + "# categories = [categories.get_text().strip().partition('\\n')[-1] for categories in categories]\n", + "# for category in categories:\n", + "# url_word += category\n", + "# page = requests.get(url_word)\n", + "# soup = BeautifulSoup(page.content, 'html.parser')\n", + "# words = soup.find_all('h3', {'lang':'sq'})\n", + "# words = [words.get_text().strip() for words in words]\n", + "# sq_words.append(words)\n", + " \n", + "# return sq_words\n", + "\n", + "# scrape_shqip()" ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3cb18097-f09c-4717-8717-692c60bb4b2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['poezi',\n", + " 'ckemi',\n", + " 'gjuha angleze',\n", + " 'urime ditëlindjen',\n", + " 'shkruaj',\n", + " 'fjalë',\n", + " 'alfabeti',\n", + " 'si thuhet ... në anglisht',\n", + " 'lukthi i njeriut',\n", + " 'translate',\n", + " 'shqiptimi',\n", + " 'thana',\n", + " 'përralla',\n", + " 'ese',\n", + " 'histori e shkurtër',\n", + " 'rrofsh',\n", + " 'dallëndyshe',\n", + " 'dialog',\n", + " 'perkthe',\n", + " 'gjethe',\n", + " 'gëlqere',\n", + " 'pyetje',\n", + " 'ngjyra',\n", + " 'lexoj',\n", + " 'hurma',\n", + " 'lexim',\n", + " 'fjale të urta',\n", + " 'xixëllonjë',\n", + " 'ora',\n", + " 'fjali',\n", + " 'gramatika',\n", + " 'tregim',\n", + " 'iriqi',\n", + " 'tekst',\n", + " 'faleminderit',\n", + " 'kanella',\n", + " 'kartolinë',\n", + " 'përshkrim',\n", + " 'une te dua ty',\n", + " 'shprehje',\n", + " 'ftua',\n", + " 'kumbulla',\n", + " 'kalendari',\n", + " 'si je',\n", + " 'letër',\n", + " 'te dua',\n", + " 'shkrim',\n", + " 'kungull',\n", + " 'libri',\n", + " 'emër',\n", + " 'këmishë',\n", + " 'tema',\n", + " 'koha e tashme',\n", + " 'liber',\n", + " 'mënyra kushtore',\n", + " 'shega',\n", + " 'kafshë',\n", + " 'bisedë',\n", + " 'mëlçia',\n", + " 'niseshte',\n", + " 'borzilok',\n", + " 'bamje',\n", + " 'bajame',\n", + " 'lajthi',\n", + " 'vërtetim',\n", + " 'sa është 
ora?',\n", + " 'rrush',\n", + " 'kali',\n", + " 'shkronja',\n", + " 'miell',\n", + " 'histori',\n", + " 'urith',\n", + " 'Projekt',\n", + " 'lule',\n", + " 'uroj',\n", + " 'qershor',\n", + " 'majdanoz',\n", + " 'lidhëza',\n", + " 'teze',\n", + " 'Kurban Bajrami',\n", + " 'mjekësi',\n", + " 'Greqisht',\n", + " 'diagrami i Venit',\n", + " 'sharje',\n", + " 'kontratë',\n", + " 'gjel',\n", + " 'lepur',\n", + " 'gjizë',\n", + " 'ekonomik',\n", + " 'përshëndetje',\n", + " 'fjalëkalimi',\n", + " 'korrik',\n", + " 'presh',\n", + " 'struci',\n", + " 'panxhar',\n", + " 'kamarier',\n", + " 'syri',\n", + " 'veshje',\n", + " 'vjeshtë',\n", + " 'gështenjë',\n", + " 'grurë',\n", + " 'gaforre',\n", + " 'shkalla pohore',\n", + " 'byrek',\n", + " 'Hirushja',\n", + " 'mbiemër',\n", + " 'karkalec',\n", + " 'horoskopi',\n", + " 'ditar',\n", + " 'përshtatje',\n", + " 'shqiptim',\n", + " 'vizore',\n", + " 'midhje',\n", + " 'e bukur',\n", + " 'gjirafa',\n", + " 'kërmilli',\n", + " 'synet',\n", + " 'halla',\n", + " 'oxhak',\n", + " 'misri',\n", + " 'shkalla sipërore',\n", + " 'bletë',\n", + " 'intervistë',\n", + " 'dollap',\n", + " 'freskore',\n", + " 'ujk',\n", + " 'si jeni',\n", + " 'pershendetje',\n", + " 'Alfabeti Fonetik Ndërkombëtar',\n", + " 'hello',\n", + " 'molla',\n", + " 'tenxhere',\n", + " 'peshore',\n", + " 'ylberi',\n", + " 'kurve',\n", + " 'dreri',\n", + " 'ushqime',\n", + " 'fraza',\n", + " 'lakër',\n", + " 'detyra',\n", + " 'luaj',\n", + " 'balluke',\n", + " 'Bricjapi',\n", + " 'matematika',\n", + " 'skifter',\n", + " 'busull',\n", + " 'legjendë',\n", + " 'bollgur',\n", + " 'gramatikë',\n", + " 'breshkë',\n", + " 'dardhë',\n", + " 'Peshorja',\n", + " 'biçikleta',\n", + " 'lemza',\n", + " '-a',\n", + " 'po',\n", + " 'Turqisht',\n", + " 'mësim',\n", + " 'lopa',\n", + " 'mikroprocesor',\n", + " 'shtepi',\n", + " 'zorrë',\n", + " 'lakuriq nate',\n", + " 'ketri',\n", + " 'dashuri',\n", + " 'komunikimi',\n", + " 'dua',\n", + " 'ju bëftë mirë',\n", + " 'karafil',\n", + " 
'mushkonja',\n", + " 'akronim',\n", + " 'qofte',\n", + " 'shprehje e kushtëzuar',\n", + " 'skuth',\n", + " 'mami',\n", + " 'heteroseksual',\n", + " 'syze',\n", + " 'mut',\n", + " 'mënyra urdhërore',\n", + " 'une',\n", + " 'raki',\n", + " 'zemër',\n", + " 'qukapik',\n", + " 'qepë',\n", + " 'fjalëkryq',\n", + " 'folje',\n", + " 'diell',\n", + " 'status',\n", + " 'zanore',\n", + " 'me',\n", + " 'Prezantimi i imazhit',\n", + " 'macja',\n", + " 'vishnjë',\n", + " 'shkronjë',\n", + " 'peshqir',\n", + " 'zemer',\n", + " 'bukur',\n", + " 'dafina',\n", + " 'anglisht',\n", + " 'lejleku',\n", + " 'Borbardha',\n", + " 'hunda',\n", + " 'gomari',\n", + " 'shkalla krahasore',\n", + " 'shkolla',\n", + " 'mesazhimi me tekst',\n", + " 'zonjushë',\n", + " 'deti',\n", + " 'ëmbëlsirë',\n", + " 'shqip',\n", + " 'te lutem',\n", + " 'buzëqeshje',\n", + " 'luleradhiqe',\n", + " 'kar',\n", + " 'perëndim',\n", + " 'pëllumb',\n", + " 'paragraf',\n", + " 'qimnon',\n", + " 'tradhtar',\n", + " 'piçkë',\n", + " 'veshi',\n", + " 'polonisht',\n", + " 'babi',\n", + " 'gjuha norvegjeze',\n", + " 'gomar',\n", + " 'vajzë',\n", + " 'xhuxh',\n", + " 'lista e kafshëve të egra',\n", + " 'mërzitur',\n", + " 'Shigjetari',\n", + " 'lukth',\n", + " 'papagall',\n", + " 'mirupafshim',\n", + " 'shkallë',\n", + " 'data e lindjes',\n", + " 'nxënës',\n", + " 'fruth',\n", + " 'fletore',\n", + " 'a',\n", + " 'ku je',\n", + " 'shkronja e njësisë',\n", + " 'vepër',\n", + " 'xhamia',\n", + " 'gjuha latine',\n", + " 'kec',\n", + " 'bizele',\n", + " 'këmisha',\n", + " 'mirëmëngjes',\n", + " 'reputacion',\n", + " 'vajguri',\n", + " 'dhuna',\n", + " 'shegë',\n", + " 'kapele',\n", + " 'xinxife',\n", + " 'portofol',\n", + " 'ngjarje',\n", + " 'katror',\n", + " 'qeni',\n", + " 'pranverë',\n", + " 'spec',\n", + " 'dele',\n", + " 'pllakat',\n", + " 'ekzemplar',\n", + " 'fjalëkryqi',\n", + " 'dora',\n", + " 'fustan',\n", + " 'ckemi lalush',\n", + " 'mjellma',\n", + " 'fjalori',\n", + " 'numri personal i identifikimit',\n", + " 
'zemra',\n", + " 'zorra',\n", + " 'mashtrues',\n", + " 'peshk',\n", + " 'volejboll',\n", + " 'folja',\n", + " 'thyesa',\n", + " 'shkronja të vogla',\n", + " 'përcjell',\n", + " 'bretkosë',\n", + " 'elbi',\n", + " 'fik',\n", + " 'xhup',\n", + " 'luledielli',\n", + " 'inxhinier',\n", + " 'manaferra',\n", + " 'ligji',\n", + " 'qershi',\n", + " 'nafta',\n", + " 'shall',\n", + " 'zog',\n", + " 'llokum',\n", + " 'uthull',\n", + " 'vetëm për lexim',\n", + " 'kopil',\n", + " 'gomë',\n", + " 'mirënjohje',\n", + " 'udhëtim',\n", + " 'qe',\n", + " 'te',\n", + " 'Dashi',\n", + " 'ftoi',\n", + " 'kur',\n", + " 'artikull',\n", + " 'mirdita',\n", + " 'fshati',\n", + " 'dogana',\n", + " 'urime',\n", + " 'pse',\n", + " 'krehër',\n", + " 'provim',\n", + " 'mandarinë',\n", + " 'love',\n", + " 'mënyra dëftore',\n", + " 'kulumbri',\n", + " 'zot',\n", + " 'breshër',\n", + " 'kërthizë',\n", + " 'shqiponjë',\n", + " 'makina',\n", + " 'rrugë të mbarë',\n", + " 'mbiemri',\n", + " 'anije me vela',\n", + " 'femër',\n", + " 'fejesa',\n", + " 'car',\n", + " 'pidhi',\n", + " 'latinisht',\n", + " 'anije',\n", + " 'ne',\n", + " 'sy',\n", + " 'gju',\n", + " 'Borëbardha',\n", + " 'roman',\n", + " 'flutur',\n", + " 'legjenda',\n", + " 'ylli',\n", + " 'mir',\n", + " 'kaprolli',\n", + " 'bashkëveprimi gravitacional',\n", + " 'përkëdhel',\n", + " 'e mërkurë',\n", + " 'noter',\n", + " 'vegla',\n", + " 'ngjyra e bardhë',\n", + " 'petull',\n", + " 'qirje',\n", + " 'dylbi',\n", + " 'dhelpra',\n", + " 'faqe',\n", + " 'ndikim',\n", + " 'Te kam xhan',\n", + " 'gjuha',\n", + " 'kërkesë',\n", + " 'mjaltë',\n", + " 'film',\n", + " 'gjuha bullgare',\n", + " 'shkruan',\n", + " 'pidh',\n", + " 'selino',\n", + " 'disa',\n", + " 'aeroplani',\n", + " 'përshesh',\n", + " 'antonimet',\n", + " 'mushkëri',\n", + " 'qëllim',\n", + " 'dyshek',\n", + " 'zogu',\n", + " 'mire',\n", + " 'mizë',\n", + " 'anglisht shqip',\n", + " 'vetulla',\n", + " 'mirmengjes',\n", + " 'blej',\n", + " 'bashkëpunim',\n", + " 'biznesi',\n", + " 
'mashurka',\n", + " 'selam alejkum',\n", + " 'mbjell',\n", + " 'breshka',\n", + " 'hoxhë',\n", + " 'bajamet',\n", + " 'sorkadhe',\n", + " 'kastravec',\n", + " 'topi',\n", + " 'perde',\n", + " 'hudhër',\n", + " 'fasule',\n", + " 'fjalim',\n", + " 'buf',\n", + " 'neni',\n", + " 'ka',\n", + " 'shigjeta djathtas',\n", + " 'pasqyra',\n", + " 'mallkimi',\n", + " 'handikapat',\n", + " 'gjyshe',\n", + " 'shkronjë kapitale',\n", + " 'hekuri',\n", + " 'futboll',\n", + " 'vazo',\n", + " 'shpirti',\n", + " 'gjeli',\n", + " 'evidentim',\n", + " 'perralla',\n", + " 'perime',\n", + " 'krahasim',\n", + " 'lejlek',\n", + " 'uji',\n", + " 'mashtrim',\n", + " 'trëndafil',\n", + " 'ariu',\n", + " 'djali',\n", + " 'vullnet',\n", + " 'numër rendor',\n", + " 'gozhdë',\n", + " 'paqe',\n", + " 'abetare',\n", + " 'legen',\n", + " 'veta e tretë',\n", + " 'mësime',\n", + " 'dardha',\n", + " 'tavolinë',\n", + " 'amanet',\n", + " 'semiotika',\n", + " 'tavan',\n", + " 'ndaj',\n", + " 'kllapa gjarpërushe e majtë',\n", + " 'gjakmarrja',\n", + " 'rrip',\n", + " 'shumës',\n", + " 'hi',\n", + " 'Mesjeta',\n", + " 'ku',\n", + " 'emri',\n", + " 'ime',\n", + " 'shpirt',\n", + " 'jam',\n", + " 'kam',\n", + " 'Wikipedia',\n", + " 'Evropa e Mesme',\n", + " 'mish',\n", + " 'familja',\n", + " 'qen',\n", + " 'kamfuri',\n", + " 'qeparis',\n", + " 'elokuent',\n", + " 'gjatësia',\n", + " 'nuse',\n", + " 'doracak',\n", + " 'e enjte',\n", + " 'buke',\n", + " 'përdhunim',\n", + " 'kastraveci',\n", + " 'bredh',\n", + " 'çorape',\n", + " 'përrallë',\n", + " 'oborr',\n", + " 'vendim',\n", + " 'perimet',\n", + " 'edhe',\n", + " 'rrjetë',\n", + " 'ngrohtë',\n", + " 'troftë',\n", + " 'pjepër',\n", + " 'paragjykim',\n", + " 'atlete',\n", + " 'fruta',\n", + " 'shkollë',\n", + " 'bre',\n", + " 'feste',\n", + " 'rosa',\n", + " 'bashki',\n", + " 'autorizim',\n", + " 'aparat',\n", + " 'kufje',\n", + " 'shqiptarja',\n", + " 'rendur',\n", + " 'si',\n", + " 'Londra',\n", + " 'xhenxhefil',\n", + " 'skifteri i gjuetisë',\n", + " 
'fjongo',\n", + " 'qukë',\n", + " 'mbretëreshë',\n", + " 'dhëndër',\n", + " 'ftesë',\n", + " 'çakmak',\n", + " 'shkumës',\n", + " 'komuna',\n", + " 'akullore',\n", + " 'ekonomist',\n", + " 'ylber',\n", + " 'puna mekanike',\n", + " 'pyll',\n", + " 'sot',\n", + " 'namazi',\n", + " 'vlerësoj',\n", + " 'hartim',\n", + " 'bukë',\n", + " 'tetor',\n", + " 'gëzuar ditëlindjen',\n", + " 'dorza',\n", + " 'përshkruaj',\n", + " 'agjërimi',\n", + " 'byth',\n", + " 'tavolina',\n", + " 'rroba',\n", + " 'fjalor',\n", + " 'pema',\n", + " 'famulltar',\n", + " 'buk',\n", + " 'byzylyk',\n", + " 'turp',\n", + " 'e',\n", + " 'alfabet',\n", + " 'makarona',\n", + " 'baba',\n", + " 'kuptimi',\n", + " 'jastëk',\n", + " 'grua',\n", + " 'nuk kuptoj',\n", + " 'ti',\n", + " 'pengesë',\n", + " 'Shenja e barazimit',\n", + " 'thonjëza të drejta',\n", + " 'budalla',\n", + " 'gjethe dafine',\n", + " 'cjap',\n", + " 'drita',\n", + " 'peshku',\n", + " 'shkurre',\n", + " 'afinitet',\n", + " 'pallat',\n", + " 'prandaj',\n", + " 'shiu acid',\n", + " 'mit',\n", + " 'me fal',\n", + " 'gjilpërë',\n", + " 'nga',\n", + " \"t'boftë mire\",\n", + " 'muaji',\n", + " 'xhaxha',\n", + " 'pula',\n", + " 'dhimbje',\n", + " 'gjykatës',\n", + " 'qaj',\n", + " 'planet',\n", + " 'teknologji',\n", + " 'hajde',\n", + " 'delja',\n", + " 'vjeshta',\n", + " 'seks',\n", + " 'bufi',\n", + " 'avull',\n", + " 'qark',\n", + " 'ndajfolje',\n", + " 'Akrepi',\n", + " 'shembull',\n", + " 'hith',\n", + " 'degjoj',\n", + " 'odiseja',\n", + " 'litar',\n", + " 'ringjallja',\n", + " 'gjuha kroate',\n", + " 'koleg',\n", + " 'potassium',\n", + " 'pushtet',\n", + " 'karrige',\n", + " 'syri i keq',\n", + " 'xheloz',\n", + " 'Demi',\n", + " 'shkoj',\n", + " 'shkronjë e madhe',\n", + " 'sobë',\n", + " 'tigan',\n", + " 'jorgan',\n", + " 'ishte',\n", + " 'thënie',\n", + " 'tung',\n", + " 'i',\n", + " 'festa',\n", + " 'djathi',\n", + " 'shpif',\n", + " 'shoqe',\n", + " 'zjarrfikës',\n", + " 'fjala',\n", + " 'karrabisht',\n", + " 'mace',\n", + " 
'truall',\n", + " 'dog',\n", + " 'Viti i Ri',\n", + " 'këpucë',\n", + " 'mbaj',\n", + " 'elb',\n", + " 'pordhë',\n", + " 'sporti',\n", + " 'tavëll duhani',\n", + " 'memec',\n", + " 'lekë',\n", + " 'gjeraqina',\n", + " 'lakra',\n", + " 'pantallona',\n", + " 'patate',\n", + " 'Ujori',\n", + " 'sjellje',\n", + " 'a flet anglisht',\n", + " 'sheqer',\n", + " 'moter',\n", + " 'per',\n", + " 'pus',\n", + " 'lapsi',\n", + " 'lakuriq',\n", + " 'entitet',\n", + " 'pershendetje si jeni',\n", + " 'unazë',\n", + " 'rreze dielli',\n", + " 'si quhesh',\n", + " 'kontabiliteti',\n", + " 'banakier',\n", + " 'bukuroshe',\n", + " 'nga jeni',\n", + " 'relievi',\n", + " 'raft',\n", + " 'buka',\n", + " 'Tekst i gatshëm',\n", + " 'dhuratë',\n", + " 'batanije',\n", + " 'gjobë',\n", + " 'kapela',\n", + " 'pendohem',\n", + " 'shqiponja',\n", + " 'integritet',\n", + " 'urim',\n", + " 'konkurs',\n", + " 'thyerja e fjalës',\n", + " 'shqip anglisht',\n", + " 'azot',\n", + " 'sëmundje',\n", + " 'pranvera',\n", + " 'thekra',\n", + " 'qiri',\n", + " 'bej',\n", + " 'i zgjuar',\n", + " 'Gaforrja',\n", + " 'dera',\n", + " 'mishi',\n", + " 'ngjer',\n", + " 'pije',\n", + " 'bark',\n", + " 'bilbil',\n", + " 'barometri',\n", + " 'xhezve',\n", + " 'shkrues letrash anonime',\n", + " 'do',\n", + " 'un',\n", + " 'timon',\n", + " 'shkurt',\n", + " 'krevat',\n", + " 'kryeqytet',\n", + " 'gjuha hebraike',\n", + " 'ide',\n", + " 'drejtkëndësh',\n", + " 'luleshtrydhe',\n", + " 'karrota',\n", + " 'Zvicra',\n", + " 'inati',\n", + " 'kafe',\n", + " 'shume',\n", + " 'kari',\n", + " 'lindje-jug-lindje',\n", + " 'zjarri',\n", + " 'mbylle gojën',\n", + " 'eshte',\n", + " 'ndryshk',\n", + " 'qanta',\n", + " 'gadishull',\n", + " 'lakmi',\n", + " 'ekskursion',\n", + " 'ethe',\n", + " 'fshesë me korent',\n", + " 'vera',\n", + " 'përemër vetor',\n", + " 'pune',\n", + " 'aplikoj',\n", + " 'pak',\n", + " 'gjuha e nënës',\n", + " 'varreza',\n", + " 'jeta',\n", + " 'fyerje',\n", + " 'sallam',\n", + " 'neser',\n", + " 
'pranga',\n", + " 'kurvë',\n", + " 'gazetar',\n", + " 'gjuha serbe',\n", + " 'krokodili',\n", + " 'përveç',\n", + " 'raport',\n", + " 'motër',\n", + " 'mallkim',\n", + " 'lungë',\n", + " 'hallë',\n", + " 'Binjakët',\n", + " 'ishull',\n", + " 'psikologjik',\n", + " 'natyra',\n", + " 'serbishtja',\n", + " 'mjekër',\n", + " 'muaj',\n", + " 'gjuha e kompjuterit',\n", + " 'krushk',\n", + " 'i paditur',\n", + " 'mbështetje',\n", + " 'you',\n", + " 'tregoj',\n", + " 'përmbledhje',\n", + " 'qiqra',\n", + " 'papafingo',\n", + " 'lavaman',\n", + " 'profesioni',\n", + " 'kajsia',\n", + " 'fyell',\n", + " 'shtator',\n", + " 'nga je',\n", + " 'qese',\n", + " 'qesh',\n", + " 'abazhur',\n", + " 'gjak',\n", + " 'dajë',\n", + " 'fjalori i sinonimeve',\n", + " 'gjuha shqipe',\n", + " 'prodhim',\n", + " 'poezia',\n", + " 'trim',\n", + " 'iriq',\n", + " 'home',\n", + " 'fije',\n", + " 'shkak',\n", + " 'guxim',\n", + " 'durimi',\n", + " 'Prefektura',\n", + " 'gruaja',\n", + " 'zotëri',\n", + " 'A',\n", + " 'biznes',\n", + " 'gjuha turke',\n", + " 'dhe',\n", + " 'Kurani',\n", + " 'mama',\n", + " 'gjithashtu',\n", + " 'marr',\n", + " 'mare',\n", + " 'mjegull',\n", + " 'kërmill',\n", + " 'shpatë',\n", + " 'faturë',\n", + " 'cfar',\n", + " 'vajza',\n", + " 'gjymtyrë',\n", + " 'gabim',\n", + " 'drejt',\n", + " 'ngjyra vjollcë',\n", + " 'jo',\n", + " 'my',\n", + " 'un jam',\n", + " 'kukull',\n", + " 'kallëzuesi',\n", + " 'gjinekolog',\n", + " 'laps',\n", + " 'merimangë',\n", + " 'rrushi',\n", + " 'si zakonisht',\n", + " 'merimanga',\n", + " 'parashikim',\n", + " 'detyre shtepie',\n", + " 'nice',\n", + " 'alfabeti cirilik',\n", + " 'shishe',\n", + " 'telefon',\n", + " 'fshesë',\n", + " 'gjatë',\n", + " 'grifsha',\n", + " 'fjala kyçe',\n", + " 'ftoj',\n", + " 'kos',\n", + " 'leopardi',\n", + " 'gjithçka',\n", + " 'kryeartikull',\n", + " 'teknologjia',\n", + " 'e ardhmja',\n", + " 'miu',\n", + " 'betohem',\n", + " 'nip',\n", + " 'nje',\n", + " 'what',\n", + " 'Bukuroshja e Fjetur',\n", + " 
'pallua',\n", + " 'nuk ka perse',\n", + " 'kokosh',\n", + " 'qelibar',\n", + " 'e premte',\n", + " 'idiot',\n", + " 'alergji',\n", + " 'mbrapa',\n", + " 'trupi',\n", + " 'për',\n", + " 'test',\n", + " 'majmun',\n", + " 'gjellë',\n", + " 'lopatë',\n", + " 'piktura',\n", + " 'portokalli',\n", + " 'ngjyrë',\n", + " 'mobilje',\n", + " 'këmba',\n", + " 'mund',\n", + " 'dashur',\n", + " 'arra',\n", + " 'buza',\n", + " 'yes',\n", + " 'qeliza',\n", + " 'peshkaqen',\n", + " 'veri-veri-lindje',\n", + " 'qafa',\n", + " 'perëndimi i diellit',\n", + " 'shesh',\n", + " 'gastare',\n", + " 'varëse',\n", + " 'abonohem',\n", + " 'i bukur',\n", + " 'temjan',\n", + " 'pjeshkë',\n", + " 'boronicë',\n", + " 'kokërr',\n", + " 'jetim',\n", + " 'suxhuk',\n", + " 'pata',\n", + " 'infermiere',\n", + " 'doreza e përmasimit',\n", + " 'dyqan',\n", + " 'qelb',\n", + " 'kush',\n", + " 'peshë',\n", + " 'kopësht',\n", + " 'kukuvajka',\n", + " 'jugu',\n", + " 'derri i egër',\n", + " 'shkop',\n", + " 'organo',\n", + " 'qift',\n", + " 'tigri',\n", + " 'tatim',\n", + " 'ari',\n", + " 'hendek',\n", + " 'shalqiri',\n", + " 'dielli',\n", + " 'stafidhe',\n", + " 'hakmarrje',\n", + " 'pica',\n", + " 'oktapod',\n", + " 'ë',\n", + " 'tradhti',\n", + " 'e hënë',\n", + " 'det',\n", + " 'maca',\n", + " 'shpesh',\n", + " 'kujtim',\n", + " 'zbokth',\n", + " 'jeta ime',\n", + " 'mali',\n", + " 'mall',\n", + " 'korbi',\n", + " 'dritare',\n", + " 'thekër',\n", + " 'karrocë',\n", + " 'spinaqi',\n", + " 'fat',\n", + " 'vjell',\n", + " 'rrufe',\n", + " 'mirëmbajtje',\n", + " 'ha',\n", + " 'vetëtima',\n", + " 'në',\n", + " 'qyqja',\n", + " 'shtog',\n", + " 'gri',\n", + " 'arkivol',\n", + " 'ushtrim',\n", + " 'lagje',\n", + " 'dëgjo',\n", + " 'dimri',\n", + " 'merr',\n", + " 'triko',\n", + " 'ngjyra e kuqe',\n", + " 'puthje',\n", + " 'nga e djathta në të majtë',\n", + " 'gjalp',\n", + " 'rafte',\n", + " 'injorant',\n", + " 'kapsolla',\n", + " 'nuselalë',\n", + " 'të dua',\n", + " 'leje',\n", + " 'detyrim',\n", + " 
'drejtor',\n", + " 'shfrytëzoj',\n", + " 'flori',\n", + " 'shigjetë',\n", + " 'dashuria ime',\n", + " 'bimë',\n", + " 'punoi',\n", + " 'punoj',\n", + " 'maj',\n", + " 'qytet',\n", + " 'homogjen',\n", + " 'ndodhet',\n", + " 'presje',\n", + " 'pyetjet më të shpeshta',\n", + " 'Luani',\n", + " 'shmang',\n", + " 'nuk',\n", + " 'lakuriqi i natës',\n", + " 'vesh',\n", + " 'dreq',\n", + " 'sepse',\n", + " 'gjatësi',\n", + " 'kikirik',\n", + " 'lindje',\n", + " 'porosi',\n", + " 'libër',\n", + " 'qep',\n", + " 'kishte',\n", + " 'tavë',\n", + " 'sorra',\n", + " 'pranoj',\n", + " 'dush',\n", + " 'gatuaj',\n", + " 'bota',\n", + " 'forca elektromagnetike',\n", + " 'shi',\n", + " 'kuzhinë',\n", + " 'benzinë',\n", + " 'bumerang',\n", + " 'kuzhinier',\n", + " 'top',\n", + " 'si quheni',\n", + " 'dolli',\n", + " 'Sesa',\n", + " 'bujqësi',\n", + " 'loqe',\n", + " 'sytjena',\n", + " 'Microsoft PowerPoint Web App',\n", + " 'kuadër',\n", + " 'bibilush',\n", + " 'dafine',\n", + " 'kërshëria',\n", + " 'autobiografi',\n", + " 'qelbës',\n", + " 'kalendari gregorian',\n", + " 'hena',\n", + " 'nxit',\n", + " 'adresa e shtëpisë',\n", + " 'nishan',\n", + " 'avokat',\n", + " 'kocë',\n", + " 'mulli',\n", + " 'liqeni',\n", + " 'yll',\n", + " 'numrat',\n", + " 'rreth',\n", + " 'marramendje',\n", + " 'pengoj',\n", + " 'perëndimi',\n", + " 'shok',\n", + " 'takim',\n", + " 'ushtar',\n", + " 'lugë',\n", + " 'kopsht',\n", + " 'pendë',\n", + " 'hierarki',\n", + " 'bashkëjetesë',\n", + " 'tërmet',\n", + " 'mrekulli',\n", + " 'korrekt',\n", + " 'abrogim',\n", + " 'inat',\n", + " 'dosje',\n", + " 'vuaj',\n", + " 'qepa',\n", + " 'musht',\n", + " 'uriq',\n", + " 'pëllumbi',\n", + " 'menyja e kontekstit',\n", + " 'Britania e Madhe',\n", + " 'cfare',\n", + " 'mësimi',\n", + " 'subvencion',\n", + " 'analiza'],\n", + " [],\n", + " [],\n", + " []]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests\n", + "from bs4 import 
BeautifulSoup\n", + "\n", + "def scrape_shqip():  # download the Glosbe sq/en top-word pages; returns one list of words per range\n", + " ranges = [\n", + " \"0-1000\",\n", + " \"1000-2000\",\n", + " \"2000-3000\",\n", + " \"3000-4000\"\n", + " ]\n", + " \n", + " sq_words = []\n", + " url = \"https://glosbe.com/topwords/sq/en/\"\n", + " for r in ranges:\n", + " page_url = url + r  # build each page URL from the untouched base; the old `url += r` chained the ranges together\n", + " page = requests.get(page_url)\n", + " soup = BeautifulSoup(page.content, 'html.parser')\n", + " words = soup.find_all('li', {'class':'mb-4'})\n", + " words = [item.get_text().strip().partition('\\n')[-1] for item in words]\n", + " sq_words.append(words)\n", + " \n", + " return sq_words\n", + "\n", + "scrape_shqip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c2da377-e4f0-4153-b1b5-3d03c312e2cc", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/lab/lab_11.ipynb b/lab/lab_11.ipynb index 0a8ce14..db923ae 100644 --- a/lab/lab_11.ipynb +++ b/lab/lab_11.ipynb @@ -52,13 +52,37 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 116, "id": "german-dispute", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['To jest przykładowy tekst.',\n", + " 'Nie wiem czym jest',\n", + " 'Python,',\n", + " 'ASCII i',\n", + " 'UNICODE.',\n", + " 'Ósmy raz sięgam ręką po borówkę.',\n", + " 'Żądło pszczoły jest kujące']" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import regex\n", + "\n", "def sentence_split(text):\n", - " return []" + " pattern = r'\\s(?=\\p{Lu})'  # split at any whitespace that is followed by an uppercase letter\n", + " segments = regex.split(pattern, text)\n", + " return segments\n", + "\n", + "text = \"To jest przykładowy tekst. Nie wiem czym jest Python, ASCII i UNICODE. Ósmy raz sięgam ręką po borówkę. 
Żądło pszczoły jest kujące\"\n", + "sentence_split(text)" ] }, { @@ -71,13 +95,251 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 69, "id": "guilty-morocco", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Jeżeli chodzi o UNICODE czy ASCII to nie wiem co to jest',\n", + " 'Jestem S444820 [',\n", + " 'Krystian Osiński',\n", + " '] Lubię jeść pizze',\n", + " 'Mam konto w banku Peako S.A i jestem z niego zadowolony.']" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import regex as r\n", + "def sentence_split_enhanced(text):\n", + " #Ulepszenie algorytmu: szukam indexu znaku białego który jest poprzedzany znakiem .?! i następuje po nim duża litera, ale przed znakiem musi wystąpić litera a-z (nie może) A-Z\n", + " # Dltaego dodatkowmo nie dziele zdań gdy wystąpią skróty pokroju S.A. oraz w przypadku wystąpienia Dużej litery w środku zdania -> segment nie jest tworzony.\n", + " # W dodatku obsługuję nawiasy () i [] -> pobieram zdania występujące w nawiasie\n", + "\n", + " pattern = r'(? https://www.europarl.europa.eu/about-parliament/en/democracy-and-human-rights\n", + "# en -> https://www.europarl.europa.eu/about-parliament/en/democracy-and-human-rights\n", + "# plik wygenerowany: aligned_pl-en.txt\n", + "def convert2xliff(hunalign_file_name,new_xliff_file_name):\n", + " text = \"\"\"\n", + " \n", + " \n", + "
\n", + " \n", + " Krystian Osinski\n", + " \n", + "
\n", + " \"\"\"\n", + " with open(hunalign_file_name, 'r', encoding='utf-8') as file:\n", + " lines = file.readlines()\n", + " for line in lines:\n", + " source, target, _unused = line.rstrip('\\n').split('\\t')  # drop only the newline: strip() also ate a leading tab, so rows with an empty first column failed to unpack\n", + " text += f\"\"\"\n", + " {source}\n", + " {target}\n", + " \"\"\"\n", + " text += r\"\"\"\n", + "
\n", + "
\"\"\"\n", + " \n", + " with open(new_xliff_file_name, 'w', encoding='utf-8') as xml_file:  # write to the requested path; it used to be hard-coded and the parameter was ignored\n", + " xml_file.write(text)\n", + " \n", + " \n", + " return 0\n", + "\n", + "convert2xliff(\"aligned_pl-en.txt\",\"xliff_pl_en\")" ] } ], "metadata": { "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", - "lang": "pl", - "subtitle": "11. Urównoleglanie", - "title": "Komputerowe wspomaganie tłumaczenia", - "year": "2021", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -206,8 +508,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" - } + "version": "3.9.2" + }, + "subtitle": "11. Urównoleglanie", + "title": "Komputerowe wspomaganie tłumaczenia", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 5