Drugi wykład
This commit is contained in:
parent
fe29296e07
commit
5fc8388eef
@ -6,7 +6,7 @@
|
||||
"source": [
|
||||
"# Wyszukiwarki - wprowadzenie\n",
|
||||
"\n",
|
||||
"## Systemy wyszukiwania informacji\n",
|
||||
"## Systemy wyszukiwania informacji (information retrieval systems)\n",
|
||||
"\n",
|
||||
"![System wyszukiwania informacji](system-wyszukiwania-informacji.png)"
|
||||
]
|
||||
@ -800,6 +800,876 @@
|
||||
"* aplikacja pozwala wylistować wszystkie wyniki oznaczone do tej pory jako interesujące"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Czego nie brać?\n",
|
||||
"\n",
|
||||
"Standard robots.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"User-agent: *\n",
|
||||
"Disallow: /*/wyszukaj/\n",
|
||||
"Disallow: /*servlet\n",
|
||||
"Disallow: /reloadwww?\n",
|
||||
"Disallow: /dfptools/adview/\n",
|
||||
"Disallow: /pub/ips/*\n",
|
||||
"Disallow: /ods?\n",
|
||||
"Disallow: /getFile.servlet*\n",
|
||||
"Disallow: /aliasy/blad.jsp\n",
|
||||
"Disallow: /znajdz.do\n",
|
||||
"Disallow: /portalSearch.do\n",
|
||||
"Disallow: /im/ab/b4/10/z17515435Q.jpg\n",
|
||||
"Disallow: /75224259/\n",
|
||||
"\n",
|
||||
"User-agent: Googlebot-News\n",
|
||||
"Disallow: /nowy/\n",
|
||||
"Disallow: /mapa_strony\n",
|
||||
"Disallow: /*/wyszukaj/\n",
|
||||
"Disallow: /*/51,\n",
|
||||
"Disallow: /*/55,\n",
|
||||
"Disallow: /*/2,\n",
|
||||
"Disallow: /*order=\n",
|
||||
"Disallow: /*obxx=\n",
|
||||
"Disallow: /*tag=\n",
|
||||
"Disallow: /reloadwww?\n",
|
||||
"Disallow: /ods?\n",
|
||||
"Disallow: /*servlet\n",
|
||||
"Disallow: /dfptools/adview/\n",
|
||||
"\n",
|
||||
"User-agent: Yandex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-Agent: bingbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 008\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 010\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 360Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 80legs\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Aboundex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: accelobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Add\\ Catalog\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: AhrefsBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: aiHitBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Alexibot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Aqua_Products\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: AskJeeves\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: asterias\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: awcheckBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: b2w/0.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BackDoorBot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BacklinkCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Baiduspider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BecomeBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BLEXBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BlowFish/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Bookmark search tool\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BotALot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: brandwatch.net\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BuiltBotTough\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Bullseye/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BunnySlippers\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Butterfly\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CatchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Charlotte\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CheeseBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPicker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPickerElite/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPickerSE/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CLIPish\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Cliqzbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: COMODO\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Comodo-Certificates-Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CompSpyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Copernic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CopyRightCheck\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: cosmos\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Crescent\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Curious\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: curl\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dataprovider\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DinoPing\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: discoverybot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DittoSpyder\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DomainCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DomainCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dotbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dotnetdotcom\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Dow\\ Jones\\ Searchbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dumbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EasouSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailCollector\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailSiphon\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailWolf\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Enterprise_Search\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Enterprise_Search/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EroCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: es\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Exabot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ExtractorPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EzineArticlesLinkScanner\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Ezooms\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FairAd Client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Flaming AttackBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Foobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FreeFind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FTRF\\:\\ Friendly\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Gaisbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: GetRight/4.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: gigabot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: grub\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: grub-client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Harvest/1.5\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Hatena Antenna\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: hloader\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: http://www.SearchEngineWorld.com bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: http://www.WebmasterWorld.com bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: HTTP_Request\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: HTTP_Request2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: httplib\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: humanlinks\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver/1.6\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Indy\\ Library\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: InfoNaviRobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ip\\-web\\-crawler\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Iron33/1.0.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jakarta\\ Commons-HttpClient\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jeeves\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: JennyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jetbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jetbot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: JikeSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Kenjin Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Keyword Density/0.9\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: larbin\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LexiBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: libWeb/clsHTTP\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: libwww-perl\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lindex\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: linkdex\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: linkdexbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkextractorPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkScan/8.1a Unix\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkWalker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lipperhey\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LNSpiderguy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: looksmart\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ltbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lwp-trivial\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lwp-trivial/1.34\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Lynx\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: magpie\\-crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mata Hari\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control - 5.01.4511\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control - 6.00.8169\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MIIxpc\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MIIxpc/4.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mister PiX\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MJ12bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: moget\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: moget/2.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MSIE\\ or\\ Firefox\\ mutant\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MSIECrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: naver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NCBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetAnts\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetcraftSurveyAgent\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: netEstate\\ NE\\ Crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetMechanic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Netseer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NextGenSearchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NICErsPRO\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Nutch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Nutch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Ocelli\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Offline Explorer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: OmniExplorer_Bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind data gathere\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: OpenWebIndex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Oracle Ultra Search\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PagesInventory\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PEAR\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PeoplePal\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PerMan\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProCogSEOBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProPowerBot/2.14\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProWebWalker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: proximic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: psbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: purebot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: QueryN Metasearch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: QuerySeekerSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Radiation Retriever 1.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RepoMonkey\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RepoMonkey Bait & Tackle/v1.01\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Riddler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RMA\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: rojerbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RyteBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: scooter\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ScoutJet\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Scrapy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ScreenerBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: searchmetrics\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: searchpreview\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SemrushBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sentibot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEO-CRAWLING\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEOENGWorldBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEOkicks-Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ShopWiki\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sistrix\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sitebot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SiteSnagger\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Snoopy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SocialSearcher\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Sogou\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SolomonoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sootle\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Sosospider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SpankBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: spanner\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: spbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Speedy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Stanford\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Stanford Comp Sci\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SurveyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: suzuran\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Szukacz/1.4\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Szukacz/1.4\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Teleport\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TeleportPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Telesoft\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Teoma\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: The Intraformant\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: The\\ Incutio\\ XML-RPC\\ PHP\\ Library\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TheNomad\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: toCrawl/UrlDispatcher\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: True_Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: True_Robot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: turingos\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TurnitinBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: uCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URL Control\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URL_Spider_Pro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URLy Warning\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: VCI\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: VCI WebViewer VCI WebViewer Win32\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: visaduhoc\\.info\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WBSearchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Web Image Collector\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebAuto\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebBandit\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebBandit/3.50\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebCapture\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebCopier\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebEnhancer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebInDetail\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebmasterWorld Extractor\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebmasterWorldForumBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebSauger\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Website Quester\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WEBSITEtheWEB\\.COM\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Webster Pro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebStripper\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebVac\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebZip\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebZip/4.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget/1.5.3\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget/1.6\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wotbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: www\\.integromedb\\.org\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WWW-Collector-E\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Xenu's\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Xenu's Link Sleuth 1.1c\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: xpymep\\.exe\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YamanaLab-Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YisouSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YodaoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YoudaoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zend_Http_Client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus 32297 Webster Pro V2.9 Win32\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus Link Scout\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ZmEu\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ZumBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Linguee\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sogou\n",
|
||||
"Disallow: /\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"url = 'https://gazeta.pl/robots.txt'\n",
|
||||
"response = requests.get(url)\n",
|
||||
"print(response.content.decode('utf-8'))\n",
|
||||
"\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Projekt 2\n",
|
||||
"\n",
|
||||
"Opracować wyszukiwarkę plików robots.txt.\n",
|
||||
"\n",
|
||||
"* pobrać robots.txt dla (prawie) wszystkich polskich stron WWW\n",
|
||||
"* umożliwić wyszukiwanie i sortowanie według wszystkich możliwych pól (blokowana wyszukiwarka, adres, komentarz,\n",
|
||||
"długość pliku itd.)\n",
|
||||
"* opracować miary pozwalające automatycznie wyłuskać „ciekawe” pliki robots.txt (długość, występowanie pełnych\n",
|
||||
"linków, odmienność od innych plików robots.txt); umożliwić sortowanie/filtrowanie według tej miary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
521
wyk/02_Wyszukiwarki-roboty.ipynb
Normal file
521
wyk/02_Wyszukiwarki-roboty.ipynb
Normal file
@ -0,0 +1,521 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Jak stworzyć swojego robota?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Narzędzia uruchamiane z wiersza poleceń\n",
|
||||
"\n",
|
||||
"* wget\n",
|
||||
"* curl\n",
|
||||
"* aria2c"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/\n",
|
||||
"Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
|
||||
"Resolving laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)... 150.254.78.3\n",
|
||||
"Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 6269 (6.1K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 6.12K --.-KB/s in 0.001s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (4.19 MB/s) - 'laboratoria.wmi.amu.edu.pl/index.html' saved [6269/6269]\n",
|
||||
"\n",
|
||||
"Loading robots.txt; please ignore errors.\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/robots.txt\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 403 Forbidden\n",
|
||||
"2021-03-17 09:25:32 ERROR 403: Forbidden.\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi.png\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 596 [image/png]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 596 --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (53.7 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi.png' saved [596/596]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/css/labs.css\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 6919 (6.8K) [text/css]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/css/labs.css'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 6.76K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (18.5 MB/s) - 'laboratoria.wmi.amu.edu.pl/css/labs.css' saved [6919/6919]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/en/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 5946 (5.8K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/en/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 5.81K --.-KB/s in 0.002s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (3.04 MB/s) - 'laboratoria.wmi.amu.edu.pl/en/index.html' saved [5946/5946]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 15034 (15K) [image/png]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 14.68K --.-KB/s in 0.005s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (2.62 MB/s) - 'laboratoria.wmi.amu.edu.pl/page-resources/wmi_transparent.png' saved [15034/15034]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 5317 (5.2K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 5.19K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (87.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia/index.html' saved [5317/5317]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/kontakt/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 4644 (4.5K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/kontakt/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 4.54K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (142 MB/s) - 'laboratoria.wmi.amu.edu.pl/kontakt/index.html' saved [4644/4644]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/pierwsze-kroki/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 6639 (6.5K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 6.48K --.-KB/s in 0.002s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (3.61 MB/s) - 'laboratoria.wmi.amu.edu.pl/pierwsze-kroki/index.html' saved [6639/6639]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/przewodnik/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 5454 (5.3K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 5.33K --.-KB/s in 0.002s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (2.97 MB/s) - 'laboratoria.wmi.amu.edu.pl/przewodnik/index.html' saved [5454/5454]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 14393 (14K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 14.06K --.-KB/s in 0.005s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (2.65 MB/s) - 'laboratoria.wmi.amu.edu.pl/regulamin-laboratoriow-komputerowych/index.html' saved [14393/14393]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 4481 (4.4K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 4.38K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (101 MB/s) - 'laboratoria.wmi.amu.edu.pl/nie-odpowiadamy/index.html' saved [4481/4481]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 12821 (13K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 12.52K --.-KB/s in 0.004s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (2.93 MB/s) - 'laboratoria.wmi.amu.edu.pl/laboratoria/oprogramowanie/index.html' saved [12821/12821]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 10688 (10K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 10.44K --.-KB/s in 0.004s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (2.74 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi/index.html' saved [10688/10688]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 4240 (4.1K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 4.14K --.-KB/s in 0.001s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (3.27 MB/s) - 'laboratoria.wmi.amu.edu.pl/uslugi-uniwersyteckie/index.html' saved [4240/4240]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/problemy/docker/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 6326 (6.2K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 6.18K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (182 MB/s) - 'laboratoria.wmi.amu.edu.pl/problemy/docker/index.html' saved [6326/6326]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/serwery-terminalowe/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 382 [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 382 --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (15.9 MB/s) - 'laboratoria.wmi.amu.edu.pl/serwery-terminalowe/index.html' saved [382/382]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/vpn/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 334 [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/vpn/index.html'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 334 --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (16.3 MB/s) - 'laboratoria.wmi.amu.edu.pl/vpn/index.html' saved [334/334]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
|
||||
"Location: https://laboratoria.wmi.amu.edu.pl/a126/ [following]\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/a126/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 3671 (3.6K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/a126'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 3.58K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (194 MB/s) - 'laboratoria.wmi.amu.edu.pl/a126' saved [3671/3671]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
|
||||
"Location: https://laboratoria.wmi.amu.edu.pl/irc/ [following]\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/irc/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 3946 (3.9K) [text/html]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/irc'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 3.85K --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (243 MB/s) - 'laboratoria.wmi.amu.edu.pl/irc' saved [3946/3946]\n",
|
||||
"\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 301 Moved Permanently\n",
|
||||
"Location: https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/ [following]\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/godziny-otwarcia/\n",
|
||||
"Reusing existing connection to laboratoria.wmi.amu.edu.pl:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 5317 (5.2K) [text/html]\n",
|
||||
"laboratoria.wmi.amu.edu.pl/godziny-otwarcia: Is a directory\n",
|
||||
"\n",
|
||||
"Cannot write to 'laboratoria.wmi.amu.edu.pl/godziny-otwarcia' (Is a directory).\n",
|
||||
"--2021-03-17 09:25:32-- https://laboratoria.wmi.amu.edu.pl/js/fix.js\n",
|
||||
"Connecting to laboratoria.wmi.amu.edu.pl (laboratoria.wmi.amu.edu.pl)|150.254.78.3|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 62 [application/javascript]\n",
|
||||
"Saving to: 'laboratoria.wmi.amu.edu.pl/js/fix.js'\n",
|
||||
"\n",
|
||||
"laboratoria.wmi.amu 100%[===================>] 62 --.-KB/s in 0s \n",
|
||||
"\n",
|
||||
"2021-03-17 09:25:32 (6.51 MB/s) - 'laboratoria.wmi.amu.edu.pl/js/fix.js' saved [62/62]\n",
|
||||
"\n",
|
||||
"FINISHED --2021-03-17 09:25:32--\n",
|
||||
"Total wall clock time: 0.3s\n",
|
||||
"Downloaded: 20 files, 115K in 0.03s (4.14 MB/s)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Pobierz rekurencyjnie, z ograniczeniem do jednego poziomu rekurencji \n",
|
||||
"! wget -r -l 1 https://laboratoria.wmi.amu.edu.pl/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"http://www.almanachmuszyny.pl/spisy/1991/AM1991_02_muszynski_zamek_prawda_i_legenda.pdf\n",
|
||||
" out=1991-1.pdf\n",
|
||||
"http://www.almanachmuszyny.pl/spisy/1991/AM1991_03_muszyna_miasteczko_historyczne.pdf\n",
|
||||
" out=1991-2.pdf\n",
|
||||
"\n",
|
||||
"03/17 09:31:54 [\u001b[1;32mNOTICE\u001b[0m] Downloading 2 item(s)\n",
|
||||
"\n",
|
||||
"03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n",
|
||||
"\n",
|
||||
"03/17 09:31:55 [\u001b[1;32mNOTICE\u001b[0m] Download complete: /home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n",
|
||||
"\n",
|
||||
"Download Results:\n",
|
||||
"gid |stat|avg speed |path/URI\n",
|
||||
"======+====+===========+=======================================================\n",
|
||||
"3bf8a7|\u001b[1;32mOK\u001b[0m | 458KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-1.pdf\n",
|
||||
"e0c4c1|\u001b[1;32mOK\u001b[0m | 677KiB/s|/home/filipg/ext/amu/aitech-eks/wyk/aria2c-example/1991-2.pdf\n",
|
||||
"\n",
|
||||
"Status Legend:\n",
|
||||
"(OK):download completed.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# aria2c pozwala łatwo pobrać listę adresów URL, dla każdego adresu można ustawić specyficzne opcje\n",
|
||||
"! (cd aria2c-example && cat aria.in)\n",
|
||||
"! (cd aria2c-example && aria2c -i aria.in)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Biblioteki/frameworki do tworzenia robotów\n",
|
||||
"\n",
|
||||
"### Python \n",
|
||||
"\n",
|
||||
"Użyteczne biblioteki: \n",
|
||||
"\n",
|
||||
"* urllib\n",
|
||||
"* requests\n",
|
||||
"* Beautiful Soup (do parsowania HTML-a)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('/en/', 'English'), ('/', '\\n\\n Laboratoria Komputerowe\\n '), ('/', 'Strona główna'), ('/godziny-otwarcia/', 'Godziny otwarcia'), ('/kontakt/', 'Kontakt'), ('/pierwsze-kroki/', 'Pierwsze kroki'), ('/przewodnik/', 'Przewodnik po stronie'), ('/regulamin-laboratoriow-komputerowych/', 'Regulamin Wydziałowych Laboratoriów Komputerowych'), ('/nie-odpowiadamy/', 'Za co nie odpowiadamy'), ('/laboratoria/oprogramowanie/', 'Laboratoria'), ('/uslugi/', 'Usługi'), ('/uslugi-uniwersyteckie/', 'Usługi Uniwersyteckie'), ('/problemy/docker/', 'Problemy'), ('/serwery-terminalowe/', 'serwera terminalowego'), ('/vpn/', 'VPN'), ('https://help.wmi.amu.edu.pl/', 'https://help.wmi.amu.edu.pl/'), ('/a126', 'A1-26'), ('https://help.wmi.amu.edu.pl/', 'System helpdeskowy'), ('mailto:helpdesk@wmi.amu.edu.pl', 'helpdesk@wmi.amu.edu.pl'), ('/irc', 'users'), ('https://www.facebook.com/wmilabs/', 'Facebook'), ('/godziny-otwarcia', 'Godziny otwarcia')]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import urllib\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"\n",
|
||||
"url = 'https://laboratoria.wmi.amu.edu.pl/'\n",
|
||||
"response = requests.get(url)\n",
|
||||
"soup = BeautifulSoup(response.content, \"html.parser\")\n",
|
||||
"\n",
|
||||
"# wydobądź wszystkie linki (elementy A)\n",
|
||||
"links = soup.find_all('a')\n",
|
||||
"print([(link['href'], link.get_text()) for link in links])\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## XPath\n",
|
||||
"\n",
|
||||
"XPath – język służący do adresowania części dokumentu XML.\n",
|
||||
"\n",
|
||||
"* `/html/body/div/p` – pełna ścieżka do wszystkich akapitów wewnątrz głównych elementów `<DIV>`\n",
|
||||
"* `//div/p` – wszystkie akapity w jakichkolwiek elementach `<DIV>`\n",
|
||||
"* `//a/@href` - wartości atrybutu `href` dla wszystkich linków\n",
|
||||
"* `//p[@id='foo']/img[5]` - piąty (indeksowanie od 1!) obrazek wewnątrz akapitu o identyfikatorze foo\n",
|
||||
"* `//p[img]/a` - linki w akapitach zawierających obrazek\n",
|
||||
"\n",
|
||||
"Czym się różni:\n",
|
||||
"\n",
|
||||
"* `//img[3]` od `(//img)[3]` ?\n",
|
||||
"* `//p[img]/a` od `//p[//img]/a` ?\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['/', '/godziny-otwarcia/', '/kontakt/', '/pierwsze-kroki/', '/przewodnik/', '/regulamin-laboratoriow-komputerowych/', '/nie-odpowiadamy/', '/laboratoria/oprogramowanie/', '/uslugi/', '/uslugi-uniwersyteckie/', '/problemy/docker/']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"from urllib.request import urlopen\n",
|
||||
"from lxml import etree\n",
|
||||
"\n",
|
||||
"url = 'https://laboratoria.wmi.amu.edu.pl/'\n",
|
||||
"\n",
|
||||
"response = urlopen(url)\n",
|
||||
"htmlparser = etree.HTMLParser()\n",
|
||||
"tree = etree.parse(response, htmlparser)\n",
|
||||
"# linki z panelu\n",
|
||||
"links = tree.xpath(\"//div[@class='sidebar-menu']//a/@href\")\n",
|
||||
"print(links)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Jak poradzić sobie z dynamicznymi stronami?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### HtmlUnit\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"WebClient webClient = new WebClient();\n",
|
||||
"HtmlPage page = webClient.getPage(\"http://ceti.pl/?ceti=administracja\");\n",
|
||||
"\n",
|
||||
"HtmlForm form = page.getForms().get(2);\n",
|
||||
"\n",
|
||||
"HtmlTextInput loginField = form.getInputByName(\"login\");\n",
|
||||
"loginField.setValueAttribute(\"atrapa\");\n",
|
||||
"HtmlPasswordInput passField = form.getInputByName(\"pass\");\n",
|
||||
"passField.setValueAttribute(\"haslo1\");\n",
|
||||
"\n",
|
||||
"HtmlImageInput button = form.getInputByValue(\"OK\");\n",
|
||||
"HtmlPage page2 = (HtmlPage)button.click();\n",
|
||||
"\n",
|
||||
"HtmlPage page3 = webClient.getPage(\"https://tau4.ceti.pl/cgi-bin/logs-user-show.cgi\");\n",
|
||||
"System.out.println(page3.asXml());\n",
|
||||
"\n",
|
||||
"UnexpectedPage page4 = webClient.getPage(\"https://adm.tau4.ceti.pl/logs.zip\");\n",
|
||||
"InputStream istr = page4.getInputStream();\n",
|
||||
"``` \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['https://www.python.org/community/sigs/guidelines', 'https://www.python.org/dev/peps/pep-0585/', 'https://www.python.org/community/lists', 'https://www.python.org/doc/essays/list2str', 'https://www.python.org/dev/core-mentorship', 'https://www.python.org/dev/peps/pep-3128/', 'https://www.python.org/dev/peps/pep-0204/', 'https://www.python.org/community/sigs/coordination', 'https://www.python.org/psf/committees', 'https://www.python.org/dev/peps/pep-0225/', 'https://www.python.org/dev/peps/pep-3132/', 'https://www.python.org/community/sigs/current/doc-sig/stext', 'https://www.python.org/dev/peps/pep-0202/', 'https://www.python.org/dev/peps/pep-0274/', 'https://www.python.org/dev/peps/pep-0469/', 'https://www.python.org/dev/peps/pep-0289/', 'https://www.python.org/dev/peps/pep-0270/', 'https://www.python.org/community/sigs/retired/string-sig', 'https://www.python.org/community/sigs/retired/progenv-sig', 'https://www.python.org/psf/records/board/minutes/2005-02-08']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# należy wcześniej uruchomić serwer selenium\n",
|
||||
"# wget https://selenium-release.storage.googleapis.com/3.141/selenium-server-standalone-3.141.59.jar\n",
|
||||
"# java -jar selenium-server-standalone-3.141.59.jar\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
|
||||
"from selenium.webdriver.common.keys import Keys\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"\n",
|
||||
"driver = webdriver.Remote(\n",
|
||||
" command_executor='http://127.0.0.1:4444/wd/hub',\n",
|
||||
" desired_capabilities=DesiredCapabilities.CHROME)\n",
|
||||
"\n",
|
||||
"driver.get(\"http://www.python.org\")\n",
|
||||
"assert \"Python\" in driver.title\n",
|
||||
"elem = driver.find_element_by_name(\"q\")\n",
|
||||
"elem.clear()\n",
|
||||
"elem.send_keys(\"list\")\n",
|
||||
"elem.send_keys(Keys.RETURN)\n",
|
||||
"links = driver.find_elements(By.XPATH, '//h3/a')\n",
|
||||
"print([l.get_attribute('href') for l in links])\n",
|
||||
"driver.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Haskell i strzałki\n",
|
||||
"\n",
|
||||
"W języku Haskell można tworzyć roboty, używając biblioteki HXT opartej na formalizmie strzałek (ang. _arrows_).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
4
wyk/aria2c-example/aria.in
Normal file
4
wyk/aria2c-example/aria.in
Normal file
@ -0,0 +1,4 @@
|
||||
http://www.almanachmuszyny.pl/spisy/1991/AM1991_02_muszynski_zamek_prawda_i_legenda.pdf
|
||||
out=1991-1.pdf
|
||||
http://www.almanachmuszyny.pl/spisy/1991/AM1991_03_muszyna_miasteczko_historyczne.pdf
|
||||
out=1991-2.pdf
|
Loading…
Reference in New Issue
Block a user