diff --git a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb index 9ede925..6015e5f 100644 --- a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb +++ b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb @@ -6,7 +6,7 @@ "source": [ "# Wyszukiwarki - wprowadzenie\n", "\n", - "## Systemy wyszukiwania informacji\n", + "## Systemy wyszukiwania informacji (information retrieval systems)\n", "\n", "![System wyszukiwania informacji](system-wyszukiwania-informacji.png)" ] @@ -800,6 +800,876 @@ "* aplikacja pozwala wylistować wszystkie wyniki oznaczone do tej pory jako interesujące" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Czego nie brać?\n", + "\n", + "Standard robots.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User-agent: *\n", + "Disallow: /*/wyszukaj/\n", + "Disallow: /*servlet\n", + "Disallow: /reloadwww?\n", + "Disallow: /dfptools/adview/\n", + "Disallow: /pub/ips/*\n", + "Disallow: /ods?\n", + "Disallow: /getFile.servlet*\n", + "Disallow: /aliasy/blad.jsp\n", + "Disallow: /znajdz.do\n", + "Disallow: /portalSearch.do\n", + "Disallow: /im/ab/b4/10/z17515435Q.jpg\n", + "Disallow: /75224259/\n", + "\n", + "User-agent: Googlebot-News\n", + "Disallow: /nowy/\n", + "Disallow: /mapa_strony\n", + "Disallow: /*/wyszukaj/\n", + "Disallow: /*/51,\n", + "Disallow: /*/55,\n", + "Disallow: /*/2,\n", + "Disallow: /*order=\n", + "Disallow: /*obxx=\n", + "Disallow: /*tag=\n", + "Disallow: /reloadwww?\n", + "Disallow: /ods?\n", + "Disallow: /*servlet\n", + "Disallow: /dfptools/adview/\n", + "\n", + "User-agent: Yandex\n", + "Disallow: /\n", + "\n", + "User-Agent: bingbot\n", + "Disallow: /\n", + "\n", + "User-agent: 008\n", + "Disallow: /\n", + "\n", + "User-agent: 010\n", + "Disallow: /\n", + "\n", + "User-agent: 360Spider\n", + "Disallow: /\n", + "\n", + "User-agent: 80legs\n", + "Disallow: /\n", + "\n", + "User-agent: 
Aboundex\n", + "Disallow: /\n", + "\n", + "User-agent: accelobot\n", + "Disallow: /\n", + "\n", + "User-agent: Add\\ Catalog\n", + "Disallow: /\n", + "\n", + "User-agent: AhrefsBot\n", + "Disallow: /\n", + "\n", + "User-agent: aiHitBot\n", + "Disallow: /\n", + "\n", + "User-agent: Alexibot\n", + "Disallow: /\n", + "\n", + "User-agent: Aqua_Products\n", + "Disallow: /\n", + "\n", + "User-agent: AskJeeves\n", + "Disallow: /\n", + "\n", + "User-agent: asterias\n", + "Disallow: /\n", + "\n", + "User-agent: awcheckBot\n", + "Disallow: /\n", + "\n", + "User-agent: b2w/0.1\n", + "Disallow: /\n", + "\n", + "User-agent: BackDoorBot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: BacklinkCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: Baiduspider\n", + "Disallow: /\n", + "\n", + "User-agent: BecomeBot\n", + "Disallow: /\n", + "\n", + "User-agent: BLEXBot\n", + "Disallow: /\n", + "\n", + "User-agent: BlowFish/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: Bookmark search tool\n", + "Disallow: /\n", + "\n", + "User-agent: BotALot\n", + "Disallow: /\n", + "\n", + "User-agent: brandwatch.net\n", + "Disallow: /\n", + "\n", + "User-agent: BuiltBotTough\n", + "Disallow: /\n", + "\n", + "User-agent: Bullseye/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: BunnySlippers\n", + "Disallow: /\n", + "\n", + "User-agent: Butterfly\n", + "Disallow: /\n", + "\n", + "User-agent: CatchBot\n", + "Disallow: /\n", + "\n", + "User-agent: Charlotte\n", + "Disallow: /\n", + "\n", + "User-agent: CheeseBot\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPicker\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPickerElite/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: CherryPickerSE/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: CLIPish\n", + "Disallow: /\n", + "\n", + "User-agent: Cliqzbot\n", + "Disallow: /\n", + "\n", + "User-agent: COMODO\n", + "Disallow: /\n", + "\n", + "User-agent: Comodo-Certificates-Spider\n", + "Disallow: /\n", + "\n", + "User-agent: CompSpyBot\n", + 
"Disallow: /\n", + "\n", + "User-agent: Copernic\n", + "Disallow: /\n", + "\n", + "User-agent: CopyRightCheck\n", + "Disallow: /\n", + "\n", + "User-agent: cosmos\n", + "Disallow: /\n", + "\n", + "User-agent: crawler\n", + "Disallow: /\n", + "\n", + "User-agent: Crescent\n", + "Disallow: /\n", + "\n", + "User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0\n", + "Disallow: /\n", + "\n", + "User-agent: Curious\n", + "Disallow: /\n", + "\n", + "User-agent: curl\n", + "Disallow: /\n", + "\n", + "User-agent: dataprovider\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: DinoPing\n", + "Disallow: /\n", + "\n", + "User-agent: discoverybot\n", + "Disallow: /\n", + "\n", + "User-agent: DittoSpyder\n", + "Disallow: /\n", + "\n", + "User-agent: DomainCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: DomainCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: dotbot\n", + "Disallow: /\n", + "\n", + "User-agent: dotnetdotcom\n", + "Disallow: /\n", + "\n", + "User-agent: Dow\\ Jones\\ Searchbot\n", + "Disallow: /\n", + "\n", + "User-agent: dumbot\n", + "Disallow: /\n", + "\n", + "User-agent: EasouSpider\n", + "Disallow: /\n", + "\n", + "User-agent: EmailCollector\n", + "Disallow: /\n", + "\n", + "User-agent: EmailSiphon\n", + "Disallow: /\n", + "\n", + "User-agent: EmailWolf\n", + "Disallow: /\n", + "\n", + "User-agent: Enterprise_Search\n", + "Disallow: /\n", + "\n", + "User-agent: Enterprise_Search/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: EroCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: es\n", + "Disallow: /\n", + "\n", + "User-agent: Exabot\n", + "Disallow: /\n", + "\n", + "User-agent: ExtractorPro\n", + "Disallow: /\n", + "\n", + "User-agent: EzineArticlesLinkScanner\n", + "Disallow: /\n", + "\n", + "User-agent: Ezooms\n", + "Disallow: /\n", + "\n", + "User-agent: FairAd Client\n", + "Disallow: /\n", + "\n", + "User-agent: Flaming AttackBot\n", + "Disallow: /\n", + "\n", + "User-agent: Foobot\n", + "Disallow: /\n", + "\n", + "User-agent: 
FreeFind\n", + "Disallow: /\n", + "\n", + "User-agent: FTRF\\:\\ Friendly\n", + "Disallow: /\n", + "\n", + "User-agent: Gaisbot\n", + "Disallow: /\n", + "\n", + "User-agent: GetRight/4.2\n", + "Disallow: /\n", + "\n", + "User-agent: gigabot\n", + "Disallow: /\n", + "\n", + "User-agent: grub\n", + "Disallow: /\n", + "\n", + "User-agent: grub-client\n", + "Disallow: /\n", + "\n", + "User-agent: Harvest/1.5\n", + "Disallow: /\n", + "\n", + "User-agent: Hatena Antenna\n", + "Disallow: /\n", + "\n", + "User-agent: hloader\n", + "Disallow: /\n", + "\n", + "User-agent: http://www.SearchEngineWorld.com bot\n", + "Disallow: /\n", + "\n", + "User-agent: http://www.WebmasterWorld.com bot\n", + "Disallow: /\n", + "\n", + "User-agent: HTTP_Request\n", + "Disallow: /\n", + "\n", + "User-agent: HTTP_Request2\n", + "Disallow: /\n", + "\n", + "User-agent: httplib\n", + "Disallow: /\n", + "\n", + "User-agent: humanlinks\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver\n", + "Disallow: /\n", + "\n", + "User-agent: ia_archiver/1.6\n", + "Disallow: /\n", + "\n", + "User-agent: Indy\\ Library\n", + "Disallow: /\n", + "\n", + "User-agent: InfoNaviRobot\n", + "Disallow: /\n", + "\n", + "User-agent: ip\\-web\\-crawler\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: Iron33/1.0.2\n", + "Disallow: /\n", + "\n", + "User-agent: Jakarta\\ Commons-HttpClient\n", + "Disallow: /\n", + "\n", + "User-agent: Jeeves\n", + "Disallow: /\n", + "\n", + "User-agent: JennyBot\n", + "Disallow: /\n", + "\n", + "User-agent: Jetbot\n", + "Disallow: /\n", + "\n", + "User-agent: Jetbot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: JikeSpider\n", + "Disallow: /\n", + "\n", + "User-agent: Kenjin Spider\n", + "Disallow: /\n", + "\n", + "User-agent: Keyword Density/0.9\n", + "Disallow: /\n", + "\n", + "User-agent: larbin\n", + "Disallow: /\n", + "\n", + "User-agent: LexiBot\n", + "Disallow: /\n", + "\n", + "User-agent: libWeb/clsHTTP\n", + 
"Disallow: /\n", + "\n", + "User-agent: libwww-perl\n", + "Disallow: /\n", + "\n", + "User-agent: lindex\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: linkdex\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: linkdexbot\n", + "Disallow: /\n", + "\n", + "User-agent: LinkextractorPro\n", + "Disallow: /\n", + "\n", + "User-agent: LinkScan/8.1a Unix\n", + "Disallow: /\n", + "\n", + "User-agent: LinkWalker\n", + "Disallow: /\n", + "\n", + "User-agent: lipperhey\n", + "Disallow: /\n", + "\n", + "User-agent: LNSpiderguy\n", + "Disallow: /\n", + "\n", + "User-agent: looksmart\n", + "Disallow: /\n", + "\n", + "User-agent: ltbot\n", + "Disallow: /\n", + "\n", + "User-agent: lwp-trivial\n", + "Disallow: /\n", + "\n", + "User-agent: lwp-trivial/1.34\n", + "Disallow: /\n", + "\n", + "User-agent: Lynx\n", + "Disallow: /\n", + "\n", + "User-agent: magpie\\-crawler\n", + "Disallow: /\n", + "\n", + "User-agent: Mata Hari\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control - 5.01.4511\n", + "Disallow: /\n", + "\n", + "User-agent: Microsoft URL Control - 6.00.8169\n", + "Disallow: /\n", + "\n", + "User-agent: MIIxpc\n", + "Disallow: /\n", + "\n", + "User-agent: MIIxpc/4.2\n", + "Disallow: /\n", + "\n", + "User-agent: Mister PiX\n", + "Disallow: /\n", + "\n", + "User-agent: MJ12bot\n", + "Disallow: /\n", + "\n", + "User-agent: moget\n", + "Disallow: /\n", + "\n", + "User-agent: moget/2.1\n", + "Disallow: /\n", + "\n", + "User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)\n", + "Disallow: /\n", + "\n", + "User-agent: MSIE\\ or\\ Firefox\\ mutant\n", + "Disallow: /\n", + "\n", + "User-agent: MSIECrawler\n", + "Disallow: /\n", + "\n", + "User-agent: naver\n", + "Disallow: /\n", + "\n", + "User-agent: NCBot\n", + "Disallow: /\n", + "\n", + "User-agent: NetAnts\n", + "Disallow: /\n", + "\n", + "User-agent: NetcraftSurveyAgent\n", + "Disallow: /\n", + "\n", + "User-agent: netEstate\\ NE\\ 
Crawler\n", + "Disallow: /\n", + "\n", + "User-agent: NetMechanic\n", + "Disallow: /\n", + "\n", + "User-agent: Netseer\n", + "Disallow: /\n", + "\n", + "User-agent: NextGenSearchBot\n", + "Disallow: /\n", + "\n", + "User-agent: NICErsPRO\n", + "Disallow: /\n", + "\n", + "User-agent: Nutch\n", + "Disallow: /\n", + "\n", + "User-agent: Nutch\n", + "Disallow: /\n", + "\n", + "User-agent: Ocelli\n", + "Disallow: /\n", + "\n", + "User-agent: Offline Explorer\n", + "Disallow: /\n", + "\n", + "User-agent: OmniExplorer_Bot\n", + "Disallow: /\n", + "\n", + "User-agent: Openbot\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind\n", + "Disallow: /\n", + "\n", + "User-agent: Openfind data gathere\n", + "Disallow: /\n", + "\n", + "User-agent: OpenWebIndex\n", + "Disallow: /\n", + "\n", + "User-agent: Oracle Ultra Search\n", + "Disallow: /\n", + "\n", + "User-agent: PagesInventory\n", + "Disallow: /\n", + "\n", + "User-agent: PEAR\n", + "Disallow: /\n", + "\n", + "User-agent: PeoplePal\n", + "Disallow: /\n", + "\n", + "User-agent: PerMan\n", + "Disallow: /\n", + "\n", + "User-agent: ProCogSEOBot\n", + "Disallow: /\n", + "\n", + "User-agent: ProPowerBot/2.14\n", + "Disallow: /\n", + "\n", + "User-agent: ProWebWalker\n", + "Disallow: /\n", + "\n", + "User-agent: proximic\n", + "Disallow: /\n", + "\n", + "User-agent: psbot\n", + "Disallow: /\n", + "\n", + "User-agent: purebot\n", + "Disallow: /\n", + "\n", + "User-agent: QueryN Metasearch\n", + "Disallow: /\n", + "\n", + "User-agent: QuerySeekerSpider\n", + "Disallow: /\n", + "\n", + "User-agent: Radiation Retriever 1.1\n", + "Disallow: /\n", + "\n", + "User-agent: RepoMonkey\n", + "Disallow: /\n", + "\n", + "User-agent: RepoMonkey Bait & Tackle/v1.01\n", + "Disallow: /\n", + "\n", + "User-agent: Riddler\n", + "Disallow: /\n", + "\n", + "User-agent: RMA\n", + "Disallow: /\n", + "\n", + "User-agent: rojerbot\n", + "Disallow: /\n", + "\n", + "User-agent: RyteBot\n", + 
"Disallow: /\n", + "\n", + "User-agent: scooter\n", + "Disallow: /\n", + "\n", + "User-agent: ScoutJet\n", + "Disallow: /\n", + "\n", + "User-agent: Scrapy\n", + "Disallow: /\n", + "\n", + "User-agent: ScreenerBot\n", + "Disallow: /\n", + "\n", + "User-agent: searchmetrics\n", + "Disallow: /\n", + "\n", + "User-agent: searchpreview\n", + "Disallow: /\n", + "\n", + "User-agent: SemrushBot\n", + "Disallow: /\n", + "\n", + "User-agent: sentibot\n", + "Disallow: /\n", + "\n", + "User-agent: SEO-CRAWLING\n", + "Disallow: /\n", + "\n", + "User-agent: SEOENGWorldBot\n", + "Disallow: /\n", + "\n", + "User-agent: SEOkicks-Robot\n", + "Disallow: /\n", + "\n", + "User-agent: ShopWiki\n", + "Disallow: /\n", + "\n", + "User-agent: sistrix\n", + "Disallow: /\n", + "\n", + "User-agent: sitebot\n", + "Disallow: /\n", + "\n", + "User-agent: SiteSnagger\n", + "Disallow: /\n", + "\n", + "User-agent: Snoopy\n", + "Disallow: /\n", + "\n", + "User-agent: SocialSearcher\n", + "Disallow: /\n", + "\n", + "User-agent: Sogou\n", + "Disallow: /\n", + "\n", + "User-agent: SolomonoBot\n", + "Disallow: /\n", + "\n", + "User-agent: sootle\n", + "Disallow: /\n", + "\n", + "User-agent: Sosospider\n", + "Disallow: /\n", + "\n", + "User-agent: SpankBot\n", + "Disallow: /\n", + "\n", + "User-agent: spanner\n", + "Disallow: /\n", + "\n", + "User-agent: spbot\n", + "Disallow: /\n", + "\n", + "User-agent: Speedy\n", + "Disallow: /\n", + "\n", + "User-agent: Stanford\n", + "Disallow: /\n", + "\n", + "User-agent: Stanford Comp Sci\n", + "Disallow: /\n", + "\n", + "User-agent: SurveyBot\n", + "Disallow: /\n", + "\n", + "User-agent: suzuran\n", + "Disallow: /\n", + "\n", + "User-agent: Szukacz/1.4\n", + "Disallow: /\n", + "\n", + "User-agent: Szukacz/1.4\n", + "Disallow: /\n", + "\n", + "User-agent: Teleport\n", + "Disallow: /\n", + "\n", + "User-agent: TeleportPro\n", + "Disallow: /\n", + "\n", + "User-agent: Telesoft\n", + "Disallow: /\n", + "\n", + "User-agent: Teoma\n", + "Disallow: /\n", + "\n", + 
"User-agent: The Intraformant\n", + "Disallow: /\n", + "\n", + "User-agent: The\\ Incutio\\ XML-RPC\\ PHP\\ Library\n", + "Disallow: /\n", + "\n", + "User-agent: TheNomad\n", + "Disallow: /\n", + "\n", + "User-agent: toCrawl/UrlDispatcher\n", + "Disallow: /\n", + "\n", + "User-agent: True_Robot\n", + "Disallow: /\n", + "\n", + "User-agent: True_Robot/1.0\n", + "Disallow: /\n", + "\n", + "User-agent: turingos\n", + "Disallow: /\n", + "\n", + "User-agent: TurnitinBot\n", + "Disallow: /\n", + "\n", + "User-agent: uCrawler\n", + "Disallow: /\n", + "\n", + "User-agent: URL Control\n", + "Disallow: /\n", + "\n", + "User-agent: URL_Spider_Pro\n", + "Disallow: /\n", + "\n", + "User-agent: URLy Warning\n", + "Disallow: /\n", + "\n", + "User-agent: VCI\n", + "Disallow: /\n", + "\n", + "User-agent: VCI WebViewer VCI WebViewer Win32\n", + "Disallow: /\n", + "\n", + "User-agent: visaduhoc\\.info\n", + "Disallow: /\n", + "\n", + "User-agent: WBSearchBot\n", + "Disallow: /\n", + "\n", + "User-agent: Web Image Collector\n", + "Disallow: /\n", + "\n", + "User-agent: WebAuto\n", + "Disallow: /\n", + "\n", + "User-agent: WebBandit\n", + "Disallow: /\n", + "\n", + "User-agent: WebBandit/3.50\n", + "Disallow: /\n", + "\n", + "User-agent: WebCapture\n", + "Disallow: /\n", + "\n", + "User-agent: WebCopier\n", + "Disallow: /\n", + "\n", + "User-agent: WebEnhancer\n", + "Disallow: /\n", + "\n", + "User-agent: WebInDetail\\.com\n", + "Disallow: /\n", + "\n", + "User-agent: WebmasterWorld Extractor\n", + "Disallow: /\n", + "\n", + "User-agent: WebmasterWorldForumBot\n", + "Disallow: /\n", + "\n", + "User-agent: WebSauger\n", + "Disallow: /\n", + "\n", + "User-agent: Website Quester\n", + "Disallow: /\n", + "\n", + "User-agent: WEBSITEtheWEB\\.COM\n", + "Disallow: /\n", + "\n", + "User-agent: Webster Pro\n", + "Disallow: /\n", + "\n", + "User-agent: WebStripper\n", + "Disallow: /\n", + "\n", + "User-agent: WebVac\n", + "Disallow: /\n", + "\n", + "User-agent: WebZip\n", + "Disallow: /\n", + 
"\n", + "User-agent: WebZip/4.0\n", + "Disallow: /\n", + "\n", + "User-agent: Wget\n", + "Disallow: /\n", + "\n", + "User-agent: Wget/1.5.3\n", + "Disallow: /\n", + "\n", + "User-agent: Wget/1.6\n", + "Disallow: /\n", + "\n", + "User-agent: Wotbot\n", + "Disallow: /\n", + "\n", + "User-agent: www\\.integromedb\\.org\n", + "Disallow: /\n", + "\n", + "User-agent: WWW-Collector-E\n", + "Disallow: /\n", + "\n", + "User-agent: Xenu's\n", + "Disallow: /\n", + "\n", + "User-agent: Xenu's Link Sleuth 1.1c\n", + "Disallow: /\n", + "\n", + "User-agent: xpymep\\.exe\n", + "Disallow: /\n", + "\n", + "User-agent: YamanaLab-Robot\n", + "Disallow: /\n", + "\n", + "User-agent: YisouSpider\n", + "Disallow: /\n", + "\n", + "User-agent: YodaoBot\n", + "Disallow: /\n", + "\n", + "User-agent: YoudaoBot\n", + "Disallow: /\n", + "\n", + "User-agent: Zend_Http_Client\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus 32297 Webster Pro V2.9 Win32\n", + "Disallow: /\n", + "\n", + "User-agent: Zeus Link Scout\n", + "Disallow: /\n", + "\n", + "User-agent: ZmEu\n", + "Disallow: /\n", + "\n", + "User-agent: ZumBot\n", + "Disallow: /\n", + "\n", + "User-agent: Linguee\n", + "Disallow: /\n", + "\n", + "User-agent: sogou\n", + "Disallow: /\n" + ] + } + ], + "source": [ + "import urllib\n", + "import requests\n", + "\n", + "url = 'https://gazeta.pl/robots.txt'\n", + "response = requests.get(url)\n", + "print(response.content.decode('utf-8'))\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Projekt 2\n", + "\n", + "Opracować wyszukiwarkę plików robots.txt.\n", + "\n", + "* pobrać robots.txt dla (prawie) wszystkich polskich stron WWW\n", + "* umożliwić wyszukiwanie i sortowanie według wszystkich możliwych pól (blokowana wyszukiwarka, adres, komentarz,\n", + "długość pliku itd.)\n", + "* opracować miary pozwalające automatycznie wyłuskać „ciekawe” pliki robots.txt (długość, występowanie pełnych\n", + 
"linków, odmienność od innych plików robots.txt); umożliwić sortowanie/filtrowanie według tej miary" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/wyk/02_Wyszukiwarki-roboty.ipynb b/wyk/02_Wyszukiwarki-roboty.ipynb new file mode 100644 index 0000000..4e207ba --- /dev/null +++ b/wyk/02_Wyszukiwarki-roboty.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Jak stworzyć swojego robota?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Narzędzia uruchamiane z wiersza poleceń\n", + "\n", + "* wget\n", + "* curl\n", + "* aria2c" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)... 150.254.78.3\n", + "Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6269 (6.1K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.12K --.-KB/s in 0.001s \n", + "\n", + "2021-03-17 09:25:32 (4.19 MB/s) - 'laboratoria.wmi.amu.edu.pl/index.html' saved [6269/6269]\n", + "\n", + "Loading robots.txt; please ignore errors.\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/robots.txt\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 403 Forbidden\n", + "2021-03-17 09:25:32 ERROR 403: Forbidden.\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi.png\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 596 [image/png]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 596 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (53.7 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png' saved [596/596]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/css/labs.css\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6919 (6.8K) [text/css]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/css/labs.css'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.76K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (18.5 MB/s) - 'laboratoria.wmi.amu.edu.pl/css/labs.css' saved [6919/6919]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/en/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5946 (5.8K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/en/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.81K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (3.04 MB/s) - 'laboratoria.wmi.amu.edu.pl/en/index.html' saved [5946/5946]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 15034 (15K) [image/png]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 14.68K --.-KB/s in 0.005s \n", + "\n", + "2021-03-17 09:25:32 (2.62 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png' saved [15034/15034]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5317 (5.2K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.19K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (87.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html' saved [5317/5317]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/kontakt/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4644 (4.5K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/kontakt/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.54K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (142 MB/s) - 'laboratoria.wmi.amu.edu.pl/kontakt/index.html' saved [4644/4644]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/pierwsze-kroki/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 6639 (6.5K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.48K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (3.61 MB/s) - 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html' saved [6639/6639]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/przewodnik/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5454 (5.3K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 5.33K --.-KB/s in 0.002s \n", + "\n", + "2021-03-17 09:25:32 (2.97 MB/s) - 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html' saved [5454/5454]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 14393 (14K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 14.06K --.-KB/s in 0.005s \n", + "\n", + "2021-03-17 09:25:32 (2.65 MB/s) - 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html' saved [14393/14393]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4481 (4.4K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.38K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (101 MB/s) - 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html' saved [4481/4481]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12821 (13K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 12.52K --.-KB/s in 0.004s \n", + "\n", + "2021-03-17 09:25:32 (2.93 MB/s) - 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html' saved [12821/12821]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10688 (10K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 10.44K --.-KB/s in 0.004s \n", + "\n", + "2021-03-17 09:25:32 (2.74 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi/index.html' saved [10688/10688]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4240 (4.1K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 4.14K --.-KB/s in 0.001s \n", + "\n", + "2021-03-17 09:25:32 (3.27 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html' saved [4240/4240]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/problemy/docker/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 6326 (6.2K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 6.18K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (182 MB/s) - 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html' saved [6326/6326]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/serwery-terminalowe/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 382 [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 382 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (15.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html' saved [382/382]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/vpn/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 334 [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/vpn/index.html'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 334 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (16.3 MB/s) - 'laboratoria.wmi.amu.edu.pl/vpn/index.html' saved [334/334]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/a126/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3671 (3.6K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/a126'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 3.58K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (194 MB/s) - 'laboratoria.wmi.amu.edu.pl/a126' saved [3671/3671]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/irc/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 3946 (3.9K) [text/html]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/irc'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 3.85K --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (243 MB/s) - 'laboratoria.wmi.amu.edu.pl/irc' saved [3946/3946]\n", + "\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/ [following]\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n", + "Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 5317 (5.2K) [text/html]\n", + "laboratoria.wmi.amu.edu.pl/godziny-otwarcia: Is a directory\n", + "\n", + "Cannot write to 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia' (Is a directory).\n", + "--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/js/fix.js\n", + "Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 62 [application/javascript]\n", + "Saving to: 'laboratoria.wmi.amu.edu.pl/js/fix.js'\n", + "\n", + "laboratoria.wmi.amu 100%[===================>] 62 --.-KB/s in 0s \n", + "\n", + "2021-03-17 09:25:32 (6.51 MB/s) - 'laboratoria.wmi.amu.edu.pl/js/fix.js' saved [62/62]\n", + "\n", + "FINISHED --2021-03-17 09:25:32--\n", + "Total wall clock time: 0.3s\n", + "Downloaded: 20 files, 115K in 0.03s (4.14 MB/s)\n" + ] + } + ], + "source": [ + "# Pobierz rekurencyjnie, z ograniczeniem do jednego poziomu rekurencji \n", + "! 
wget -r -l 1 https://laboratoria.wmi.amu.edu.pl/" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://www.almanachmuszyny.pl/spisy/1991/AM1991_02_muszynski_zamek_prawda_i_legenda.pdf\n", + " out=1991-1.pdf\n", + "http://www.almanachmuszyny.pl/spisy/1991/AM1991_03_muszyna_miasteczko_historyczne.pdf\n", + " out=1991-2.pdf\n", + "\n", + "03/17 09:31:54 [\u001b[1;32mNOTICE\u001b[0m] Downloading 2 item(s)\n", + "\n", + "03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n", + "\n", + "03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n", + "\n", + "Download Results:\n", + "gid |stat|avg speed |path/URI\n", + "======+====+===========+=======================================================\n", + "3bf8a7|\u001b[1;32mOK\u001b[0m | 458KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n", + "e0c4c1|\u001b[1;32mOK\u001b[0m | 677KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n", + "\n", + "Status Legend:\n", + "(OK):download completed.\n" + ] + } + ], + "source": [ + "# aria2c pozwala łatwo pobrać listę adresów URL, dla każdego adresu można ustawić specyficzne opcje\n", + "! (cd aria2c-example && cat aria.in)\n", + "! 
(cd aria2c-example && aria2c -i aria.in)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Biblioteki/frameworki do tworzenia robotów\n", + "\n", + "### Python \n", + "\n", + "Użyteczne biblioteki: \n", + "\n", + "* urllib\n", + "* requests\n", + "* Beautiful Soup (do parsowania HTML-a)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('/en/', 'English'), ('/', '\\n\\n Laboratoria Komputerowe\\n '), ('/', 'Strona główna'), ('/godziny-otwarcia/', 'Godziny otwarcia'), ('/kontakt/', 'Kontakt'), ('/pierwsze-kroki/', 'Pierwsze kroki'), ('/przewodnik/', 'Przewodnik po stronie'), ('/regulamin-laboratoriow-komputerowych/', 'Regulamin Wydziałowych Laboratoriów Komputerowych'), ('/nie-odpowiadamy/', 'Za co nie odpowiadamy'), ('/laboratoria/oprogramowanie/', 'Laboratoria'), ('/uslugi/', 'Usługi'), ('/uslugi-uniwersyteckie/', 'Usługi Uniwersyteckie'), ('/problemy/docker/', 'Problemy'), ('/serwery-terminalowe/', 'serwera terminalowego'), ('/vpn/', 'VPN'), ('https://help.wmi.amu.edu.pl/', 'https://help.wmi.amu.edu.pl/'), ('/a126', 'A1-26'), ('https://help.wmi.amu.edu.pl/', 'System helpdeskowy'), ('mailto:helpdesk@wmi.amu.edu.pl', 'helpdesk@wmi.amu.edu.pl'), ('/irc', 'users'), ('https://www.facebook.com/wmilabs/', 'Facebook'), ('/godziny-otwarcia', 'Godziny otwarcia')]\n" + ] + } + ], + "source": [ + "import urllib\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "url = 'https://laboratoria.wmi.amu.edu.pl/'\n", + "response = requests.get(url)\n", + "soup = BeautifulSoup(response.content, \"html.parser\")\n", + "\n", + "# wydobądź wszystkie linki (elementy A)\n", + "links = soup.find_all('a')\n", + "print([(link['href'], link.get_text()) for link in links])\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XPath\n", + "\n", + "XPath – język służący do adresowania części dokumentu 
XML.\n", + "\n", + "* `/html/body/div/p` – pełna ścieżka do wszystkich akapitów wewnątrz głównych elementów `