feat: separate all logic into small methods

patilk 2024-11-13 23:56:43 +01:00
parent a451409fa6
commit f65292d891
Signed by: s500042
GPG Key ID: 1921AD722E7392EE
6 changed files with 128 additions and 44 deletions

View File: Dockerfile

@@ -3,24 +3,13 @@ FROM python:3.9-slim
WORKDIR /usr/src/app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --trusted-host pypi.python.org -r requirements.txt
COPY . .
RUN apt-get update && apt-get install -y \
wget \
unzip \
curl \
libx11-dev \
libgdk-pixbuf2.0-0 \
libcanberra-gtk-module \
&& wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \
&& mv chromedriver /usr/local/bin/ \
&& chmod +x /usr/local/bin/chromedriver
RUN apt-get update && apt-get install -y wget unzip && \
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
apt install -y ./google-chrome-stable_current_amd64.deb && \
rm ./google-chrome-stable_current_amd64.deb && \
apt-get clean
RUN useradd python
RUN chown -R python /usr/src/app
USER python
CMD ["python", "main.py"]

View File: main.py

@@ -1,12 +1,26 @@
from scraper import scraper
from scraper import scrap
import os
import json
import time
urls = ["https://digitalprojects.wpi.art/monet/artworks"]
hrefs = []
def main():
print("Starting the application...\n\n")
scraper()
print("\n\nApplication finished!")
time.sleep(8)
directory = "dist"
file_path = os.path.join(directory, "data.json")
scrap(urls[0])
data = []
try:
os.mkdir("dist")
except FileExistsError:
pass
with open(file_path, "w", encoding="utf-8") as file:
json.dump(data, file)
print("Data has been scraped!")
if __name__ == "__main__":
    main()
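
As committed, main() dumps the empty data list and discards scrap()'s return value. A minimal sketch of wiring the two together (assuming the intent is to persist what scrap() returns; the data shape here is illustrative, not part of the commit):

html = scrap(urls[0])
data = {"url": urls[0], "html": html}  # illustrative shape
os.makedirs(directory, exist_ok=True)  # equivalent to the try/except around os.mkdir
with open(file_path, "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False)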

View File: requirements.txt

@@ -1,8 +1,24 @@
attrs==24.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
charset-normalizer==3.4.0
h11==0.14.0
idna==3.10
lxml==5.3.0
outcome==1.3.0.post0
packaging==24.2
PySocks==1.7.1
python-dotenv==1.0.1
requests==2.32.3
selenium==4.26.1
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
trio==0.27.0
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0

View File: scraper.py

@@ -1,17 +1,78 @@
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
def scraper():
directory = "dist"
file_path = os.path.join(directory, "data.json")
class Scraper:
def __init__(self, url):
self.url = url
self.hrefs = []
self.driver = self.load_driver()
data = []
def load_driver(self) -> webdriver.Chrome:
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
try:
os.mkdir("dist")
except FileExistsError:
pass
with open(file_path, "w", encoding="utf-8") as file:
json.dump(data, file)
print("Data has been scraped!")
return webdriver.Chrome(
options=options,
service=(
Service(ChromeDriverManager().install())
if os.path.exists("/.dockerenv")
else None
),
)
def skip_cookies(self) -> None:
WebDriverWait(self.driver, 5).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
)
)
button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
self.driver.execute_script(
"""
arguments[0].removeAttribute('disabled');
arguments[0].className = 'border-button';
""",
button,
)
button.click()
time.sleep(2)
def load_page(self) -> None:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
def locate_valid_artworks(self) -> list[str]:
WebDriverWait(self.driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
)
artworks = self.driver.find_elements(
By.CSS_SELECTOR, ".artwork-search-results article"
)
for artwork in artworks:
href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
self.hrefs.append(href)
return self.hrefs
def scrap(url: str):
instance = Scraper(url)
driver = instance.driver
driver.get(url)
instance.skip_cookies()
instance.load_page()
hrefs = instance.locate_valid_artworks()
print(hrefs)
html = driver.page_source
driver.quit()
return html
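
With the logic split into methods, typical usage is a single call, or the individual steps when debugging (the URL below is the one main.py already uses):

html = scrap("https://digitalprojects.wpi.art/monet/artworks")

# or step by step:
s = Scraper("https://digitalprojects.wpi.art/monet/artworks")
s.driver.get(s.url)
s.skip_cookies()          # dismisses the cookie banner via the _ngcontent selector
s.load_page()             # scrolls to the bottom to trigger lazy loading
print(s.locate_valid_artworks())  # list of artwork hrefs
s.driver.quit()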

View File: docker-compose.yml

@@ -3,8 +3,6 @@ services:
build:
context: ./app
container_name: webscraper
depends_on:
- redis
volumes:
- ./app:/usr/src/app
develop:
@@ -13,14 +11,20 @@ services:
action: rebuild
- path: ./app
target: /usr/src/app
action: sync
redis:
image: "redis:alpine"
volumes:
- redis_data:/data
action: sync+restart
command: tail -f /dev/null
selenium-hub:
image: "selenium/hub:3.141.59"
container_name: selenium-hub
ports:
- "6379:6379"
- "4444:4444"
# redis:
# image: "redis:alpine"
# volumes:
# - redis_data:/data
# ports:
# - "6379:6379"
volumes:
redis_data:
# redis_data:
app:
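
Note that the hub is wired in at the compose level only; scraper.py still starts a local Chrome. If the intent is to route sessions through selenium-hub, the driver setup would look roughly like this (a sketch, assuming the default Grid 3 endpoint and the service name above):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")

# "selenium-hub" resolves on the compose network; 4444 is the port published above
driver = webdriver.Remote(
    command_executor="http://selenium-hub:4444/wd/hub",
    options=options,
)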

View File

@@ -4,4 +4,4 @@ docker compose up -d
docker compose wait webscraper > /dev/null
docker compose down
# docker compose down