diff --git a/README.md b/README.md
index 8952594..c548d4b 100644
--- a/README.md
+++ b/README.md
@@ -4,71 +4,4 @@
 This project is a web scraper designed to extract data from websites.
 
-## Features
-☑️ Extracts data from web pages
-
-## Usage
-
-### With Docker
-
-1. Clone the repository:
-
-```bash
-git clone https://git.wmi.amu.edu.pl/s500042/webscraper
-```
-
-2. Navigate to the project directory:
-
-```bash
-cd webscraper
-```
-
-3. Build the Docker image and run it using `start.py` script:
-
-```bash
-python scripts/start.py
-```
-
-On Mac, you'll have to use
-
-```bash
-python3 scripts/start.py
-```
-
-4. Check `/app/dist/data.json` file to see the extracted data.
-
-### Without Docker
-
-1. Clone the repository:
-
-```bash
-git clone https://git.wmi.amu.edu.pl/s500042/webscraper
-```
-
-2. Install the required dependencies:
-
-```bash
-pip install -r app/requirements.txt
-```
-
-If you're on Arch Linux, you'll need to create a virtual environment.
-Here's is a [Step by step guide](#) that will help you create it.
-
-3. Run `run_with_no_docker.py` script:
-
-```bash
-python scripts/run_with_no_docker.py
-```
-
-On Mac you'll, need to use:
-
-```bash
-python3 scripts/run_with_no_docker.py
-```
-
-4. Check `/app/dist/data.json` file to see the extracted data.
-
-## License
-
-This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
 
 
diff --git a/app/scripts/noguchi.py b/app/scripts/noguchi.py
index 1af7ff5..1ae2e54 100644
--- a/app/scripts/noguchi.py
+++ b/app/scripts/noguchi.py
@@ -1,7 +1,4 @@
 import time
-import requests
-import os
-import json
 from playwright.async_api import async_playwright
 import asyncio
 
@@ -11,6 +8,7 @@
 NOTE: Some pages doesn'y have info about paintings, so we need to skip them
 """
 
+
 class NoguchiScraper:
     def __init__(self, url="https://archive.noguchi.org/Browse/CR", base_url="https://archive.noguchi.org"):
         self.hrefs = []
@@ -36,9 +34,6 @@
         element = await self.find_el('a.acceptCookie')
         await element.click()
 
-    async def insert_value(self, selector, value):
-        await self.page.fill(selector, value)
-
     async def find_el(self, selector: str):
         await self.wait_for_el(selector)
         return await self.page.query_selector(selector)
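
For context on the helpers this diff keeps, here is a minimal sketch of how `find_el` and the cookie-banner click presumably compose with Playwright's async API. It assumes `wait_for_el` is a thin wrapper around `page.wait_for_selector`; the `open_page` helper and the stored `url`/`base_url`/`page` attributes are illustrative assumptions, since only the constructor signature, `find_el`, and the cookie click appear in the diff.

```python
import asyncio
from playwright.async_api import async_playwright


class NoguchiScraper:
    def __init__(self, url="https://archive.noguchi.org/Browse/CR",
                 base_url="https://archive.noguchi.org"):
        self.hrefs = []
        self.url = url            # assumption: constructor keeps its arguments
        self.base_url = base_url
        self.page = None

    async def open_page(self):
        # Hypothetical setup helper (not in this diff): launch a browser
        # and navigate to the archive's browse page.
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch()
        self.page = await self.browser.new_page()
        await self.page.goto(self.url)

    async def wait_for_el(self, selector: str):
        # Assumption: thin wrapper around Playwright's built-in wait.
        await self.page.wait_for_selector(selector)

    async def find_el(self, selector: str):
        # As in the diff: wait for the selector, then return its handle.
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)


async def main():
    scraper = NoguchiScraper()
    await scraper.open_page()
    # Accept the cookie banner, as in the diff's context lines.
    element = await scraper.find_el("a.acceptCookie")
    if element:  # defensive: query_selector returns None for a missing node
        await element.click()
    await scraper.browser.close()
    await scraper._playwright.stop()


asyncio.run(main())
```

Note the split mirrored from the diff: `wait_for_selector` raises on timeout, while `query_selector` returns `None` for a missing node, so callers of `find_el` still need to handle pages that, per the module's NOTE, lack painting info.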