From 5fc8388eefd4c6023227c855dc3895b16e5d7e8a Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Wed, 17 Mar 2021 11:03:03 +0100 Subject: [PATCH] =?UTF-8?q?Drugi=20wyk=C5=82ad?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- wyk/01_Wyszukiwarki-wprowadzenie.ipynb | 872 ++++++++++++++++++++++++- wyk/02_Wyszukiwarki-roboty.ipynb | 521 +++++++++++++++ wyk/aria2c-example/aria.in | 4 + 3 files changed, 1396 insertions(+), 1 deletion(-) create mode 100644 wyk/02_Wyszukiwarki-roboty.ipynb create mode 100644 wyk/aria2c-example/aria.in diff --git a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb index 9ede925..6015e5f 100644 --- a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb +++ b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb @@ -6,7 +6,7 @@ "source": [ "# Wyszukiwarki - wprowadzenie\n", "\n", - "## Systemy wyszukiwania informacji\n", + "## Systemy wyszukiwania informacji (information retrieval systems)\n", "\n", "![System wyszukiwania informacji](system-wyszukiwania-informacji.png)" ] @@ -800,6 +800,876 @@ "* aplikacja pozwala wylistować wszystkie wyniki oznaczone do tej pory jako interesujące" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Czego nie brać?\n", + "\n", + "Standard robots.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User-agent: *\n", + "Disallow: /*/wyszukaj/\n", + "Disallow: /*servlet\n", + "Disallow: /reloadwww?\n", + "Disallow: /dfptools/adview/\n", + "Disallow: /pub/ips/*\n", + "Disallow: /ods?\n", + "Disallow: /getFile.servlet*\n", + "Disallow: /aliasy/blad.jsp\n", + "Disallow: /znajdz.do\n", + "Disallow: /portalSearch.do\n", + "Disallow: /im/ab/b4/10/z17515435Q.jpg\n", + "Disallow: /75224259/\n", + "\n", + "User-agent: Googlebot-News\n", + "Disallow: /nowy/\n", + "Disallow: /mapa_strony\n", + "Disallow: /*/wyszukaj/\n", + "Disallow: 
/*/51,\n", + "Disallow: /*/55,\n", + "Disallow: /*/2,\n", + "Disallow: /*order=\n", + "Disallow: /*obxx=\n", + "Disallow: /*tag=\n", + "Disallow: /reloadwww?\n", + "Disallow: /ods?\n", + "Disallow: /*servlet\n", + "Disallow: /dfptools/adview/\n", + "\n", + "User-agent: Yandex\n", + "Disallow: /\n", + "\n", + "User-Agent: bingbot\n", + "Disallow: /\n", + "\n", + "User-agent: 008\n", + "Disallow: /\n", + "\n", + "User-agent: 010\n", + "Disallow: /\n", + "\n", + "User-agent: 360Spider\n", + "Disallow: /\n", + "\n", + "User-agent: 80legs\n", + "Disallow: /\n", + "\n", + "User-agent: Aboundex\n", + "Disallow: /\n", + "\n", + "User-agent: accelobot\n", + "Disallow: /\n", + "\n", + "User-agent: Add\\ Catalog\n", + "Disallow: /\n", + "\n", + "User-agent: AhrefsBot\n", + "Disallow: /\n", + "\n", + "User-agent: aiHitBot\n", + "Disallow: /\n", + "\n", + "User-agent: Alexibot\n", + "Disallow: /\n", + "\n", + "User-agent: Aqua_Products\n", + "Disallow: /\n", + "\n", + "User-agent: AskJeeves\n", + "Disallow: /\n", + "\n", + "User-agent: asterias\n", + "Disallow: /\n", + "\n", + "User-agent: awcheckBot\n", + "Disallow: /\n", + "\n", + "User-agent: b2w/0.1\n", + "Disallow: /\n", + "\n", + "User-agent: BackDoorBot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: BacklinkCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: Baiduspider\n", + "Disallow: /\n", + "\n", + "User-agent: BecomeBot\n", + "Disallow: /\n", + "\n", + "User-agent: BLEXBot\n", + "Disallow: /\n", + "\n", + "User-agent: BlowFish/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: Bookmark search tool\n", + "Disallow: /\n", + "\n", + "User-agent: BotALot\n", + "Disallow: /\n", + "\n", + "User-agent: brandwatch.net\n", + "Disallow: /\n", + "\n", + "User-agent: BuiltBotTough\n", + "Disallow: /\n", + "\n", + "User-agent: Bullseye/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: BunnySlippers\n", + "Disallow: /\n", + "\n", + "User-agent: Butterfly\n", + "Disallow: /\n", + "\n", + "User-agent: CatchBot\n", + "Disallow: 
/\n", + "\n", + "User-agent: Charlotte\n", + "Disallow: /\n", + "\n", + "User-agent: CheeseBot\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPicker\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPickerElite/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPickerSE/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: CLIPish\n", + "Disallow: /\n", + "\n", + "User-agent: Cliqzbot\n", + "Disallow: /\n", + "\n", + "User-agent: COMODO\n", + "Disallow: /\n", + "\n", + "User-agent: Comodo-Certificates-Spider\n", + "Disallow: /\n", + "\n", + "User-agent: CompSpyBot\n", + "Disallow: /\n", + "\n", + "User-agent: Copernic\n", + "Disallow: /\n", + "\n", + "User-agent: CopyRightCheck\n", + "Disallow: /\n", + "\n", + "User-agent: cosmos\n", + "Disallow: /\n", + "\n", + "User-agent: crawler\n", + "Disallow: /\n", + "\n", + "User-agent: Crescent\n", + "Disallow: /\n", + "\n", + "User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0\n", + "Disallow: /\n", + "\n", + "User-agent: Curious\n", + "Disallow: /\n", + "\n", + "User-agent: curl\n", + "Disallow: /\n", + "\n", + "User-agent: dataprovider\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: DinoPing\n", + "Disallow: /\n", + "\n", + "User-agent: discoverybot\n", + "Disallow: /\n", + "\n", + "User-agent: DittoSpyder\n", + "Disallow: /\n", + "\n", + "User-agent: DomainCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: DomainCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: dotbot\n", + "Disallow: /\n", + "\n", + "User-agent: dotnetdotcom\n", + "Disallow: /\n", + "\n", + "User-agent: Dow\\ Jones\\ Searchbot\n", + "Disallow: /\n", + "\n", + "User-agent: dumbot\n", + "Disallow: /\n", + "\n", + "User-agent: EasouSpider\n", + "Disallow: /\n", + "\n", + "User-agent: EmailCollector\n", + "Disallow: /\n", + "\n", + "User-agent: EmailSiphon\n", + "Disallow: /\n", + "\n", + "User-agent: EmailWolf\n", + "Disallow: /\n", + "\n", + "User-agent: Enterprise_Search\n", + "Disallow: /\n", + "\n", + "User-agent: 
Enterprise_Search/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: EroCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: es\n", + "Disallow: /\n", + "\n", + "User-agent: Exabot\n", + "Disallow: /\n", + "\n", + "User-agent: ExtractorPro\n", + "Disallow: /\n", + "\n", + "User-agent: EzineArticlesLinkScanner\n", + "Disallow: /\n", + "\n", + "User-agent: Ezooms\n", + "Disallow: /\n", + "\n", + "User-agent: FairAd Client\n", + "Disallow: /\n", + "\n", + "User-agent: Flaming AttackBot\n", + "Disallow: /\n", + "\n", + "User-agent: Foobot\n", + "Disallow: /\n", + "\n", + "User-agent: FreeFind\n", + "Disallow: /\n", + "\n", + "User-agent: FTRF\\:\\ Friendly\n", + "Disallow: /\n", + "\n", + "User-agent: Gaisbot\n", + "Disallow: /\n", + "\n", + "User-agent: GetRight/4.2\n", + "Disallow: /\n", + "\n", + "User-agent: gigabot\n", + "Disallow: /\n", + "\n", + "User-agent: grub\n", + "Disallow: /\n", + "\n", + "User-agent: grub-client\n", + "Disallow: /\n", + "\n", + "User-agent: Harvest/1.5\n", + "Disallow: /\n", + "\n", + "User-agent: Hatena Antenna\n", + "Disallow: /\n", + "\n", + "User-agent: hloader\n", + "Disallow: /\n", + "\n", + "User-agent: http://www.SearchEngineWorld.com bot\n", + "Disallow: /\n", + "\n", + "User-agent: http://www.WebmasterWorld.com bot\n", + "Disallow: /\n", + "\n", + "User-agent: HTTP_Request\n", + "Disallow: /\n", + "\n", + "User-agent: HTTP_Request2\n", + "Disallow: /\n", + "\n", + "User-agent: httplib\n", + "Disallow: /\n", + "\n", + "User-agent: humanlinks\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver/1.6\n", + "Disallow: /\n", + "\n", + "User-agent: Indy\\ Library\n", + "Disallow: /\n", + "\n", + "User-agent: InfoNaviRobot\n", + "Disallow: /\n", + "\n", + "User-agent: ip\\-web\\-crawler\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: Iron33/1.0.2\n", + "Disallow: /\n", + "\n", + "User-agent: Jakarta\\ 
Commons-HttpClient\n", + "Disallow: /\n", + "\n", + "User-agent: Jeeves\n", + "Disallow: /\n", + "\n", + "User-agent: JennyBot\n", + "Disallow: /\n", + "\n", + "User-agent: Jetbot\n", + "Disallow: /\n", + "\n", + "User-agent: Jetbot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: JikeSpider\n", + "Disallow: /\n", + "\n", + "User-agent: Kenjin Spider\n", + "Disallow: /\n", + "\n", + "User-agent: Keyword Density/0.9\n", + "Disallow: /\n", + "\n", + "User-agent: larbin\n", + "Disallow: /\n", + "\n", + "User-agent: LexiBot\n", + "Disallow: /\n", + "\n", + "User-agent: libWeb/clsHTTP\n", + "Disallow: /\n", + "\n", + "User-agent: libwww-perl\n", + "Disallow: /\n", + "\n", + "User-agent: lindex\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: linkdex\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: linkdexbot\n", + "Disallow: /\n", + "\n", + "User-agent: LinkextractorPro\n", + "Disallow: /\n", + "\n", + "User-agent: LinkScan/8.1a Unix\n", + "Disallow: /\n", + "\n", + "User-agent: LinkWalker\n", + "Disallow: /\n", + "\n", + "User-agent: lipperhey\n", + "Disallow: /\n", + "\n", + "User-agent: LNSpiderguy\n", + "Disallow: /\n", + "\n", + "User-agent: looksmart\n", + "Disallow: /\n", + "\n", + "User-agent: ltbot\n", + "Disallow: /\n", + "\n", + "User-agent: lwp-trivial\n", + "Disallow: /\n", + "\n", + "User-agent: lwp-trivial/1.34\n", + "Disallow: /\n", + "\n", + "User-agent: Lynx\n", + "Disallow: /\n", + "\n", + "User-agent: magpie\\-crawler\n", + "Disallow: /\n", + "\n", + "User-agent: Mata Hari\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control - 5.01.4511\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control - 6.00.8169\n", + "Disallow: /\n", + "\n", + "User-agent: MIIxpc\n", + "Disallow: /\n", + "\n", + "User-agent: MIIxpc/4.2\n", + "Disallow: /\n", + "\n", + "User-agent: Mister PiX\n", + "Disallow: /\n", + "\n", + "User-agent: MJ12bot\n", + "Disallow: /\n", + "\n", 
+ "User-agent: moget\n", + "Disallow: /\n", + "\n", + "User-agent: moget/2.1\n", + "Disallow: /\n", + "\n", + "User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)\n", + "Disallow: /\n", + "\n", + "User-agent: MSIE\\ or\\ Firefox\\ mutant\n", + "Disallow: /\n", + "\n", + "User-agent: MSIECrawler\n", + "Disallow: /\n", + "\n", + "User-agent: naver\n", + "Disallow: /\n", + "\n", + "User-agent: NCBot\n", + "Disallow: /\n", + "\n", + "User-agent: NetAnts\n", + "Disallow: /\n", + "\n", + "User-agent: NetcraftSurveyAgent\n", + "Disallow: /\n", + "\n", + "User-agent: netEstate\\ NE\\ Crawler\n", + "Disallow: /\n", + "\n", + "User-agent: NetMechanic\n", + "Disallow: /\n", + "\n", + "User-agent: Netseer\n", + "Disallow: /\n", + "\n", + "User-agent: NextGenSearchBot\n", + "Disallow: /\n", + "\n", + "User-agent: NICErsPRO\n", + "Disallow: /\n", + "\n", + "User-agent: Nutch\n", + "Disallow: /\n", + "\n", + "User-agent: Nutch\n", + "Disallow: /\n", + "\n", + "User-agent: Ocelli\n", + "Disallow: /\n", + "\n", + "User-agent: Offline Explorer\n", + "Disallow: /\n", + "\n", + "User-agent: OmniExplorer_Bot\n", + "Disallow: /\n", + "\n", + "User-agent: Openbot\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind data gathere\n", + "Disallow: /\n", + "\n", + "User-agent: OpenWebIndex\n", + "Disallow: /\n", + "\n", + "User-agent: Oracle Ultra Search\n", + "Disallow: /\n", + "\n", + "User-agent: PagesInventory\n", + "Disallow: /\n", + "\n", + "User-agent: PEAR\n", + "Disallow: /\n", + "\n", + "User-agent: PeoplePal\n", + "Disallow: /\n", + "\n", + "User-agent: PerMan\n", + "Disallow: /\n", + "\n", + "User-agent: ProCogSEOBot\n", + "Disallow: /\n", + "\n", + "User-agent: ProPowerBot/2.14\n", + "Disallow: /\n", + "\n", + "User-agent: ProWebWalker\n", + "Disallow: /\n", + "\n", + "User-agent: proximic\n", + "Disallow: /\n", + "\n", + "User-agent: psbot\n", + "Disallow: 
/\n", + "\n", + "User-agent: purebot\n", + "Disallow: /\n", + "\n", + "User-agent: QueryN Metasearch\n", + "Disallow: /\n", + "\n", + "User-agent: QuerySeekerSpider\n", + "Disallow: /\n", + "\n", + "User-agent: Radiation Retriever 1.1\n", + "Disallow: /\n", + "\n", + "User-agent: RepoMonkey\n", + "Disallow: /\n", + "\n", + "User-agent: RepoMonkey Bait & Tackle/v1.01\n", + "Disallow: /\n", + "\n", + "User-agent: Riddler\n", + "Disallow: /\n", + "\n", + "User-agent: RMA\n", + "Disallow: /\n", + "\n", + "User-agent: rojerbot\n", + "Disallow: /\n", + "\n", + "User-agent: RyteBot\n", + "Disallow: /\n", + "\n", + "User-agent: scooter\n", + "Disallow: /\n", + "\n", + "User-agent: ScoutJet\n", + "Disallow: /\n", + "\n", + "User-agent: Scrapy\n", + "Disallow: /\n", + "\n", + "User-agent: ScreenerBot\n", + "Disallow: /\n", + "\n", + "User-agent: searchmetrics\n", + "Disallow: /\n", + "\n", + "User-agent: searchpreview\n", + "Disallow: /\n", + "\n", + "User-agent: SemrushBot\n", + "Disallow: /\n", + "\n", + "User-agent: sentibot\n", + "Disallow: /\n", + "\n", + "User-agent: SEO-CRAWLING\n", + "Disallow: /\n", + "\n", + "User-agent: SEOENGWorldBot\n", + "Disallow: /\n", + "\n", + "User-agent: SEOkicks-Robot\n", + "Disallow: /\n", + "\n", + "User-agent: ShopWiki\n", + "Disallow: /\n", + "\n", + "User-agent: sistrix\n", + "Disallow: /\n", + "\n", + "User-agent: sitebot\n", + "Disallow: /\n", + "\n", + "User-agent: SiteSnagger\n", + "Disallow: /\n", + "\n", + "User-agent: Snoopy\n", + "Disallow: /\n", + "\n", + "User-agent: SocialSearcher\n", + "Disallow: /\n", + "\n", + "User-agent: Sogou\n", + "Disallow: /\n", + "\n", + "User-agent: SolomonoBot\n", + "Disallow: /\n", + "\n", + "User-agent: sootle\n", + "Disallow: /\n", + "\n", + "User-agent: Sosospider\n", + "Disallow: /\n", + "\n", + "User-agent: SpankBot\n", + "Disallow: /\n", + "\n", + "User-agent: spanner\n", + "Disallow: /\n", + "\n", + "User-agent: spbot\n", + "Disallow: /\n", + "\n", + "User-agent: Speedy\n", + 
"Disallow: /\n", + "\n", + "User-agent: Stanford\n", + "Disallow: /\n", + "\n", + "User-agent: Stanford Comp Sci\n", + "Disallow: /\n", + "\n", + "User-agent: SurveyBot\n", + "Disallow: /\n", + "\n", + "User-agent: suzuran\n", + "Disallow: /\n", + "\n", + "User-agent: Szukacz/1.4\n", + "Disallow: /\n", + "\n", + "User-agent: Szukacz/1.4\n", + "Disallow: /\n", + "\n", + "User-agent: Teleport\n", + "Disallow: /\n", + "\n", + "User-agent: TeleportPro\n", + "Disallow: /\n", + "\n", + "User-agent: Telesoft\n", + "Disallow: /\n", + "\n", + "User-agent: Teoma\n", + "Disallow: /\n", + "\n", + "User-agent: The Intraformant\n", + "Disallow: /\n", + "\n", + "User-agent: The\\ Incutio\\ XML-RPC\\ PHP\\ Library\n", + "Disallow: /\n", + "\n", + "User-agent: TheNomad\n", + "Disallow: /\n", + "\n", + "User-agent: toCrawl/UrlDispatcher\n", + "Disallow: /\n", + "\n", + "User-agent: True_Robot\n", + "Disallow: /\n", + "\n", + "User-agent: True_Robot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: turingos\n", + "Disallow: /\n", + "\n", + "User-agent: TurnitinBot\n", + "Disallow: /\n", + "\n", + "User-agent: uCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: URL Control\n", + "Disallow: /\n", + "\n", + "User-agent: URL_Spider_Pro\n", + "Disallow: /\n", + "\n", + "User-agent: URLy Warning\n", + "Disallow: /\n", + "\n", + "User-agent: VCI\n", + "Disallow: /\n", + "\n", + "User-agent: VCI WebViewer VCI WebViewer Win32\n", + "Disallow: /\n", + "\n", + "User-agent: visaduhoc\\.info\n", + "Disallow: /\n", + "\n", + "User-agent: WBSearchBot\n", + "Disallow: /\n", + "\n", + "User-agent: Web Image Collector\n", + "Disallow: /\n", + "\n", + "User-agent: WebAuto\n", + "Disallow: /\n", + "\n", + "User-agent: WebBandit\n", + "Disallow: /\n", + "\n", + "User-agent: WebBandit/3.50\n", + "Disallow: /\n", + "\n", + "User-agent: WebCapture\n", + "Disallow: /\n", + "\n", + "User-agent: WebCopier\n", + "Disallow: /\n", + "\n", + "User-agent: WebEnhancer\n", + "Disallow: /\n", + "\n", + "User-agent: 
WebInDetail\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: WebmasterWorld Extractor\n", + "Disallow: /\n", + "\n", + "User-agent: WebmasterWorldForumBot\n", + "Disallow: /\n", + "\n", + "User-agent: WebSauger\n", + "Disallow: /\n", + "\n", + "User-agent: Website Quester\n", + "Disallow: /\n", + "\n", + "User-agent: WEBSITEtheWEB\\.COM\n", + "Disallow: /\n", + "\n", + "User-agent: Webster Pro\n", + "Disallow: /\n", + "\n", + "User-agent: WebStripper\n", + "Disallow: /\n", + "\n", + "User-agent: WebVac\n", + "Disallow: /\n", + "\n", + "User-agent: WebZip\n", + "Disallow: /\n", + "\n", + "User-agent: WebZip/4.0\n", + "Disallow: /\n", + "\n", + "User-agent: Wget\n", + "Disallow: /\n", + "\n", + "User-agent: Wget/1.5.3\n", + "Disallow: /\n", + "\n", + "User-agent: Wget/1.6\n", + "Disallow: /\n", + "\n", + "User-agent: Wotbot\n", + "Disallow: /\n", + "\n", + "User-agent: www\\.integromedb\\.org\n", + "Disallow: /\n", + "\n", + "User-agent: WWW-Collector-E\n", + "Disallow: /\n", + "\n", + "User-agent: Xenu's\n", + "Disallow: /\n", + "\n", + "User-agent: Xenu's Link Sleuth 1.1c\n", + "Disallow: /\n", + "\n", + "User-agent: xpymep\\.exe\n", + "Disallow: /\n", + "\n", + "User-agent: YamanaLab-Robot\n", + "Disallow: /\n", + "\n", + "User-agent: YisouSpider\n", + "Disallow: /\n", + "\n", + "User-agent: YodaoBot\n", + "Disallow: /\n", + "\n", + "User-agent: YoudaoBot\n", + "Disallow: /\n", + "\n", + "User-agent: Zend_Http_Client\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus 32297 Webster Pro V2.9 Win32\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus Link Scout\n", + "Disallow: /\n", + "\n", + "User-agent: ZmEu\n", + "Disallow: /\n", + "\n", + "User-agent: ZumBot\n", + "Disallow: /\n", + "\n", + "User-agent: Linguee\n", + "Disallow: /\n", + "\n", + "User-agent: sogou\n", + "Disallow: /\n" + ] + } + ], + "source": [ + "import urllib\n", + "import requests\n", + "\n", + "url = 'https://gazeta.pl/robots.txt'\n", + 
"response = requests.get(url)\n", + "print(response.content.decode('utf-8'))\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Projekt 2\n", + "\n", + "Opracować wyszukiwarkę plików robots.txt.\n", + "\n", + "* pobrać robots.txt dla (prawie) wszystkich polskich stron WWW\n", + "* umożliwić wyszukiwanie i sortowanie według wszystkich możliwych pól (blokowana wyszukiwarka, adres, komentarz,\n", + "długość pliku itd.)\n", + "* opracować miary pozwalające automatycznie wyłuskać „ciekawe” pliki robots.txt (długość, występowanie pełnych\n", + "linków, odmienność od innych plików robots.txt); umożliwić sortowanie/filtrowanie według tej miary" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/wyk/02_Wyszukiwarki-roboty.ipynb b/wyk/02_Wyszukiwarki-roboty.ipynb new file mode 100644 index 0000000..4e207ba --- /dev/null +++ b/wyk/02_Wyszukiwarki-roboty.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jak stworzyć swojego robota?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Narzędzia uruchamiane z wiersza poleceń\n", + "\n", + "* wget\n", + "* curl\n", + "* aria2c" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)... 150.254.78.3\n", + "Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 6269 (6.1K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.12K --.-KB/s in 0.001s \n", + "\n", + "2021-03-17 09:25:32 (4.19 MB/s) - 'laboratoria.wmi.amu.edu.pl/index.html' saved [6269/6269]\n", + "\n", + "Loading robots.txt; please ignore errors.\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/robots.txt\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 403 Forbidden\n", + "2021-03-17 09:25:32 ERROR 403: Forbidden.\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi.png\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 596 [image/png]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 596 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (53.7 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png' saved [596/596]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/css/labs.css\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6919 (6.8K) [text/css]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/css/labs.css'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.76K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (18.5 MB/s) - 'laboratoria.wmi.amu.edu.pl/css/labs.css' saved [6919/6919]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/en/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 5946 (5.8K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/en/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.81K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (3.04 MB/s) - 'laboratoria.wmi.amu.edu.pl/en/index.html' saved [5946/5946]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 15034 (15K) [image/png]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 14.68K --.-KB/s in 0.005s \n", + "\n", + "2021-03-17 09:25:32 (2.62 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png' saved [15034/15034]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5317 (5.2K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.19K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (87.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html' saved [5317/5317]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/kontakt/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4644 (4.5K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/kontakt/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.54K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (142 MB/s) - 'laboratoria.wmi.amu.edu.pl/kontakt/index.html' saved [4644/4644]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/pierwsze-kroki/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6639 (6.5K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.48K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (3.61 MB/s) - 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html' saved [6639/6639]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/przewodnik/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5454 (5.3K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.33K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (2.97 MB/s) - 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html' saved [5454/5454]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 14393 (14K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 14.06K --.-KB/s in 0.005s \n", + "\n", + "2021-03-17 09:25:32 (2.65 MB/s) - 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html' saved [14393/14393]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4481 (4.4K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.38K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (101 MB/s) - 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html' saved [4481/4481]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12821 (13K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 12.52K --.-KB/s in 0.004s \n", + "\n", + "2021-03-17 09:25:32 (2.93 MB/s) - 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html' saved [12821/12821]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 10688 (10K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 10.44K --.-KB/s in 0.004s \n", + "\n", + "2021-03-17 09:25:32 (2.74 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi/index.html' saved [10688/10688]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4240 (4.1K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.14K --.-KB/s in 0.001s \n", + "\n", + "2021-03-17 09:25:32 (3.27 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html' saved [4240/4240]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/problemy/docker/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6326 (6.2K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.18K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (182 MB/s) - 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html' saved [6326/6326]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/serwery-terminalowe/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 382 [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 382 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (15.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html' saved [382/382]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/vpn/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 334 [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/vpn/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 334 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (16.3 MB/s) - 'laboratoria.wmi.amu.edu.pl/vpn/index.html' saved [334/334]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/a126/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3671 (3.6K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/a126'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 3.58K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (194 MB/s) - 'laboratoria.wmi.amu.edu.pl/a126' saved [3671/3671]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/irc/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3946 (3.9K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/irc'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 3.85K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (243 MB/s) - 'laboratoria.wmi.amu.edu.pl/irc' saved [3946/3946]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5317 (5.2K) [text/html]\n", + "laboratoria.wmi.amu.edu.pl/godziny-otwarcia: Is a directory\n", + "\n", + "Cannot write to 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia' (Is a directory).\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/js/fix.js\n", + "Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 62 [application/javascript]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/js/fix.js'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 62 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (6.51 MB/s) - 'laboratoria.wmi.amu.edu.pl/js/fix.js' saved [62/62]\n", + "\n", + "FINISHED --2021-03-17 09:25:32--\n", + "Total wall clock time: 0.3s\n", + "Downloaded: 20 files, 115K in 0.03s (4.14 MB/s)\n" + ] + } + ], + "source": [ + "# Pobierz rekurencyjnie, z ograniczeniem do jednego poziomu rekurencji \n", + "! wget -r -l 1 https://laboratoria.wmi.amu.edu.pl/" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://www.almanachmuszyny.pl/spisy/1991/AM1991_02_muszynski_zamek_prawda_i_legenda.pdf\n", + " out=1991-1.pdf\n", + "http://www.almanachmuszyny.pl/spisy/1991/AM1991_03_muszyna_miasteczko_historyczne.pdf\n", + " out=1991-2.pdf\n", + "\n", + "03/17 09:31:54 [\u001b[1;32mNOTICE\u001b[0m] Downloading 2 item(s)\n", + "\n", + "03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n", + "\n", + "03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n", + "\n", + "Download Results:\n", + "gid |stat|avg speed |path/URI\n", + "======+====+===========+=======================================================\n", + "3bf8a7|\u001b[1;32mOK\u001b[0m | 458KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n", + "e0c4c1|\u001b[1;32mOK\u001b[0m | 677KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n", + "\n", + "Status Legend:\n", + "(OK):download completed.\n" + ] + } + ], + "source": [ + "# aria2c pozwala łatwo pobrać listę adresów URL, dla każdego adresu można ustawić specyficzne opcje\n", + "! (cd aria2c-example && cat aria.in)\n", + "! 
(cd aria2c-example && aria2c -i aria.in)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Biblioteki/frameworki do tworzenia robotów\n", + "\n", + "### Python \n", + "\n", + "Użyteczne biblioteki: \n", + "\n", + "* urllib\n", + "* requests\n", + "* Beautiful Soup (do parsowania HTML-a)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('/en/', 'English'), ('/', '\\n\\n Laboratoria Komputerowe\\n '), ('/', 'Strona główna'), ('/godziny-otwarcia/', 'Godziny otwarcia'), ('/kontakt/', 'Kontakt'), ('/pierwsze-kroki/', 'Pierwsze kroki'), ('/przewodnik/', 'Przewodnik po stronie'), ('/regulamin-laboratoriow-komputerowych/', 'Regulamin Wydziałowych Laboratoriów Komputerowych'), ('/nie-odpowiadamy/', 'Za co nie odpowiadamy'), ('/laboratoria/oprogramowanie/', 'Laboratoria'), ('/uslugi/', 'Usługi'), ('/uslugi-uniwersyteckie/', 'Usługi Uniwersyteckie'), ('/problemy/docker/', 'Problemy'), ('/serwery-terminalowe/', 'serwera terminalowego'), ('/vpn/', 'VPN'), ('https://help.wmi.amu.edu.pl/', 'https://help.wmi.amu.edu.pl/'), ('/a126', 'A1-26'), ('https://help.wmi.amu.edu.pl/', 'System helpdeskowy'), ('mailto:helpdesk@wmi.amu.edu.pl', 'helpdesk@wmi.amu.edu.pl'), ('/irc', 'users'), ('https://www.facebook.com/wmilabs/', 'Facebook'), ('/godziny-otwarcia', 'Godziny otwarcia')]\n" + ] + } + ], + "source": [ + "import urllib\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "url = 'https://laboratoria.wmi.amu.edu.pl/'\n", + "response = requests.get(url)\n", + "soup = BeautifulSoup(response.content, \"html.parser\")\n", + "\n", + "# wydobądź wszystkie linki (elementy A)\n", + "links = soup.find_all('a')\n", + "print([(link['href'], link.get_text()) for link in links])\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XPath\n", + "\n", + "XPath – język służący do adresowania części dokumentu 
XML.\n", + "\n", + "* `/html/body/div/p` – pełna ścieżka do wszystkich akapitów wewnątrz głównych elementów `<div>`\n", + "* `//div/p` – wszystkie akapity w jakichkolwiek elementach `<div>`\n", + "* `//a/@href` - wartości atrybutu `href` dla wszystkich linków\n", + "* `//p[@id='foo']/img[5]` - piąty (indeksowanie od 1!) obrazek wewnątrz akapitu o identyfikatorze foo\n", + "* `//p[img]/a` - linki w akapitach zawierających obrazek\n", + "\n", + "Czym się różni:\n", + "\n", + "* `//img[3]` od `(//img)[3]` ?\n", + "* `//p[img]/a` od `//p[//img]/a` ?\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['/', '/godziny-otwarcia/', '/kontakt/', '/pierwsze-kroki/', '/przewodnik/', '/regulamin-laboratoriow-komputerowych/', '/nie-odpowiadamy/', '/laboratoria/oprogramowanie/', '/uslugi/', '/uslugi-uniwersyteckie/', '/problemy/docker/']\n" + ] + } + ], + "source": [ + "\n", + "from urllib.request import urlopen\n", + "from lxml import etree\n", + "\n", + "url = 'https://laboratoria.wmi.amu.edu.pl/'\n", + "\n", + "response = urlopen(url)\n", + "htmlparser = etree.HTMLParser()\n", + "tree = etree.parse(response, htmlparser)\n", + "# linki z panelu\n", + "links = tree.xpath(\"//div[@class='sidebar-menu']//a/@href\")\n", + "print(links)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Jak poradzić sobie z dynamicznymi stronami?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HtmlUnit\n", + "\n", + "```\n", + "WebClient webClient = new WebClient();\n", + "HtmlPage page = webClient.getPage(\"http://ceti.pl/?ceti=administracja\");\n", + "\n", + "HtmlForm form = page.getForms().get(2);\n", + "\n", + "HtmlTextInput loginField = form.getInputByName(\"login\");\n", + "loginField.setValueAttribute(\"atrapa\");\n", + "HtmlPasswordInput passField = form.getInputByName(\"pass\");\n", + "passField.setValueAttribute(\"haslo1\");\n", + "\n", + "HtmlImageInput button = form.getInputByValue(\"OK\");\n", + "HtmlPage page2 = (HtmlPage)button.click();\n", + "\n", + "HtmlPage page3 = webClient.getPage(\"https://tau4.ceti.pl/cgi-bin/logs-user-show.cgi\");\n", + "System.out.println(page3.asXml());\n", + "\n", + "UnexpectedPage page4 = webClient.getPage(\"https://adm.tau4.ceti.pl/logs.zip\");\n", + "InputStream istr = page4.getInputStream();\n", + "``` \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Selenium" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['https://www.python.org/community/sigs/guidelines', 'https://www.python.org/dev/peps/pep-0585/', 'https://www.python.org/community/lists', 'https://www.python.org/doc/essays/list2str', 'https://www.python.org/dev/core-mentorship', 'https://www.python.org/dev/peps/pep-3128/', 'https://www.python.org/dev/peps/pep-0204/', 'https://www.python.org/community/sigs/coordination', 'https://www.python.org/psf/committees', 'https://www.python.org/dev/peps/pep-0225/', 'https://www.python.org/dev/peps/pep-3132/', 'https://www.python.org/community/sigs/current/doc-sig/stext', 'https://www.python.org/dev/peps/pep-0202/', 'https://www.python.org/dev/peps/pep-0274/', 'https://www.python.org/dev/peps/pep-0469/', 'https://www.python.org/dev/peps/pep-0289/', 
'https://www.python.org/dev/peps/pep-0270/', 'https://www.python.org/community/sigs/retired/string-sig', 'https://www.python.org/community/sigs/retired/progenv-sig', 'https://www.python.org/psf/records/board/minutes/2005-02-08']\n" + ] + } + ], + "source": [ + "# należy wcześniej uruchomić serwer selenium\n", + "# wget https://selenium-release.storage.googleapis.com/3.141/selenium-server-standalone-3.141.59.jar\n", + "# java -jar selenium-server-standalone-3.141.59.jar\n", + "\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n", + "from selenium.webdriver.common.keys import Keys\n", + "from selenium.webdriver.common.by import By\n", + "\n", + "driver = webdriver.Remote(\n", + " command_executor='http://127.0.0.1:4444/wd/hub',\n", + " desired_capabilities=DesiredCapabilities.CHROME)\n", + "\n", + "driver.get(\"http://www.python.org\")\n", + "assert \"Python\" in driver.title\n", + "elem = driver.find_element_by_name(\"q\")\n", + "elem.clear()\n", + "elem.send_keys(\"list\")\n", + "elem.send_keys(Keys.RETURN)\n", + "links = driver.find_elements(By.XPATH, '//h3/a')\n", + "print([l.get_attribute('href') for l in links])\n", + "driver.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Haskell i strzałki\n", + "\n", + "W języku Haskell można tworzyć roboty, używając biblioteki HXT opartej na formalizmie strzałek (ang. 
_arrows_).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/wyk/aria2c-example/aria.in b/wyk/aria2c-example/aria.in new file mode 100644 index 0000000..c6a14c7 --- /dev/null +++ b/wyk/aria2c-example/aria.in @@ -0,0 +1,4 @@ +http://www.almanachmuszyny.pl/spisy/1991/AM1991_02_muszynski_zamek_prawda_i_legenda.pdf + out=1991-1.pdf +http://www.almanachmuszyny.pl/spisy/1991/AM1991_03_muszyna_miasteczko_historyczne.pdf + out=1991-2.pdf