# wko_anime-face-similarity/scrape_data.py

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re

ROOT_URL = "https://myanimelist.net/character.php"

def get_page_xpath_result(url, xpath_str, timeout=10):
    """Download ``url`` and evaluate ``xpath_str`` against its HTML.

    The response body is parsed with BeautifulSoup's ``html.parser``,
    serialised back to a string, and that markup is handed to lxml, which
    runs the XPath query.

    Args:
        url: Address of the page to download.
        xpath_str: XPath expression to evaluate on the parsed document.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one a stalled connection would block indefinitely.

    Returns:
        The result of ``dom.xpath(xpath_str)``, or an empty list when lxml
        produces no document from the markup.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # python-requests agent -- TODO confirm.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    page = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error response instead of querying an error page
    # and returning results that look like "nothing found".
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    dom = etree.HTML(str(soup))
    if dom is None:
        # No element tree could be built (e.g. empty body): nothing to query.
        return []
    return dom.xpath(xpath_str)

# 1. face image
# 2. character name
# 3. link

# Download the listing once and derive both lists from the same anchor
# elements, so links and names come from a single snapshot of the page
# rather than from two separate requests.
character_anchors = get_page_xpath_result(ROOT_URL, '//div[@class="information di-ib mt24"]/a')
# Only anchors that actually carry an href contribute a link, which matches
# what selecting ".../a/@href" returns.
character_links = [
    anchor.get("href")
    for anchor in character_anchors
    if anchor.get("href") is not None
]
character_names = [anchor.text for anchor in character_anchors]

print("character_links")
print(character_links)

print("character_names")
print(character_names)
scrap links and names 2023-01-15 12:03:55 +01:00			`import requests`
			`from bs4 import BeautifulSoup`
			`from lxml import etree`
			`import re`

			`ROOT_URL = "https://myanimelist.net/character.php"`

			`def get_page_xpath_result(url, xpath_str):`
			`HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}`
			`page = requests.get(url, headers=HEADERS)`
			`soup = BeautifulSoup(page.content, "html.parser")`
			`dom = etree.HTML(str(soup))`
			`return dom.xpath(xpath_str)`

			`# 1. face image`
			`# 2. character name`
			`# 3. link`

			`character_links = get_page_xpath_result(ROOT_URL, '//div[@class="information di-ib mt24"]/a/@href')`
			`character_names = get_page_xpath_result(ROOT_URL, '//div[@class="information di-ib mt24"]/a')`
			`character_names = [link.text for link in character_names]`

			`print("character_links")`
			`print(character_links)`

			`print("character_names")`
			`print(character_names)`