feat: separate all logic into small methods
Some checks are pending
Docker Image CI / build (push) Waiting to run
Some checks are pending
Docker Image CI / build (push) Waiting to run
This commit is contained in:
parent
a451409fa6
commit
f65292d891
@ -3,24 +3,13 @@ FROM python:3.9-slim
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install --trusted-host pypi.python.org -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
wget \
|
||||
unzip \
|
||||
curl \
|
||||
libx11-dev \
|
||||
libgdk-pixbuf2.0-0 \
|
||||
libcanberra-gtk-module \
|
||||
&& wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
|
||||
&& unzip chromedriver_linux64.zip \
|
||||
&& mv chromedriver /usr/local/bin/ \
|
||||
&& chmod +x /usr/local/bin/chromedriver
|
||||
RUN apt-get update && apt-get install -y wget unzip && \
|
||||
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
|
||||
apt install -y ./google-chrome-stable_current_amd64.deb && \
|
||||
rm ./google-chrome-stable_current_amd64.deb && \
|
||||
apt-get clean
|
||||
|
||||
RUN useradd python
|
||||
RUN chown -R python /usr/src/app
|
||||
USER python
|
||||
|
||||
CMD ["python", "main.py"]
|
24
app/main.py
24
app/main.py
@ -1,12 +1,26 @@
|
||||
from scraper import scraper
|
||||
from scraper import scrap
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
|
||||
urls = ["https://digitalprojects.wpi.art/monet/artworks"]
|
||||
hrefs = []
|
||||
|
||||
|
||||
def main():
|
||||
print("Starting the application...\n\n")
|
||||
scraper()
|
||||
print("\n\nApplication finished!")
|
||||
time.sleep(8)
|
||||
directory = "dist"
|
||||
file_path = os.path.join(directory, "data.json")
|
||||
scrap(urls[0])
|
||||
|
||||
data = []
|
||||
|
||||
try:
|
||||
os.mkdir("dist")
|
||||
except FileExistsError:
|
||||
pass
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
json.dump(data, file)
|
||||
print("Data has been scraped!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,8 +1,24 @@
|
||||
attrs==24.2.0
|
||||
beautifulsoup4==4.12.3
|
||||
bs4==0.0.2
|
||||
certifi==2024.8.30
|
||||
charset-normalizer==3.4.0
|
||||
h11==0.14.0
|
||||
idna==3.10
|
||||
lxml==5.3.0
|
||||
outcome==1.3.0.post0
|
||||
packaging==24.2
|
||||
PySocks==1.7.1
|
||||
python-dotenv==1.0.1
|
||||
requests==2.32.3
|
||||
selenium==4.26.1
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
soupsieve==2.6
|
||||
trio==0.27.0
|
||||
trio-websocket==0.11.1
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.3
|
||||
webdriver-manager==4.0.2
|
||||
websocket-client==1.8.0
|
||||
wsproto==1.2.0
|
||||
|
@ -1,17 +1,78 @@
|
||||
import os
|
||||
import json
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import time
|
||||
|
||||
|
||||
def scraper():
|
||||
directory = "dist"
|
||||
file_path = os.path.join(directory, "data.json")
|
||||
class Scraper:
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.hrefs = []
|
||||
self.driver = self.load_driver()
|
||||
|
||||
data = []
|
||||
def load_driver(self) -> webdriver.Chrome:
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument("--headless")
|
||||
options.add_argument("--no-sandbox")
|
||||
|
||||
try:
|
||||
os.mkdir("dist")
|
||||
except FileExistsError:
|
||||
pass
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
json.dump(data, file)
|
||||
print("Data has been scraped!")
|
||||
return webdriver.Chrome(
|
||||
options=options,
|
||||
service=(
|
||||
Service(ChromeDriverManager().install())
|
||||
if os.path.exists("/.dockerenv")
|
||||
else None
|
||||
),
|
||||
)
|
||||
|
||||
def skip_cookies(self) -> None:
    """Force-enable and click the element matched by the consent selector.

    Waits up to 5 seconds for an element carrying the
    ``_ngcontent-ng-c745257238`` attribute to be present, removes its
    ``disabled`` attribute and resets its class via JavaScript, clicks it,
    and then pauses for 2 seconds.

    NOTE(review): presumably this element is the cookie-banner button, as
    the method name suggests; the selector is a generated attribute and
    should be verified against the live page.
    """
    selector = "[_ngcontent-ng-c745257238]"
    enable_button_js = """
    arguments[0].removeAttribute('disabled');
    arguments[0].className = 'border-button';
    """

    # Block until the element exists in the DOM before looking it up.
    wait = WebDriverWait(self.driver, 5)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))

    consent_button = self.driver.find_element(By.CSS_SELECTOR, selector)
    # The script strips `disabled` so the click below is not ignored.
    self.driver.execute_script(enable_button_js, consent_button)
    consent_button.click()

    # Fixed pause after the click before the caller continues.
    time.sleep(2)
|
||||
|
||||
def load_page(self) -> None:
    """Scroll the browser window to the bottom of the document, then pause.

    Runs a ``window.scrollTo`` script through the driver and sleeps for
    2 seconds afterwards.
    """
    scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"
    self.driver.execute_script(scroll_to_bottom_js)

    # Fixed pause after scrolling before the caller reads the page.
    time.sleep(2)
|
||||
|
||||
def locate_valid_artworks(self) -> list[str]:
    """Collect the link target of every artwork card on the current page.

    Waits up to 5 seconds for the ``.artwork-search-results`` container to
    be present, then appends the ``href`` of the first ``<a>`` inside each
    ``article`` to ``self.hrefs``.

    Articles that contain no anchor, or whose anchor has no ``href``, are
    skipped instead of raising or storing ``None``, so the returned list
    only ever holds strings.

    Returns:
        ``self.hrefs`` — the same list object that is stored on the
        instance, including anything collected by earlier calls.
    """
    WebDriverWait(self.driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
    )
    artworks = self.driver.find_elements(
        By.CSS_SELECTOR, ".artwork-search-results article"
    )
    for artwork in artworks:
        # find_elements returns an empty list (rather than raising) when
        # the article has no link, which lets us skip it cleanly.
        anchors = artwork.find_elements(By.CSS_SELECTOR, "a")
        if not anchors:
            continue
        href = anchors[0].get_attribute("href")
        # get_attribute may return None when the attribute is missing.
        if href:
            self.hrefs.append(href)
    return self.hrefs
|
||||
|
||||
|
||||
def scrap(url: str) -> str:
    """Open *url* in a fresh browser session and return its page source.

    Creates a ``Scraper`` for the URL, navigates to it, runs the
    ``skip_cookies`` and ``load_page`` steps, prints the artwork links
    found by ``locate_valid_artworks``, and returns the page's HTML.

    The driver is always shut down, even when one of the steps raises
    (for example a wait timing out), so a failed run does not leave a
    browser process behind.

    Args:
        url: Address of the page to load.

    Returns:
        The driver's ``page_source`` captured after the steps above.
    """
    instance = Scraper(url)
    driver = instance.driver
    try:
        driver.get(url)

        instance.skip_cookies()
        instance.load_page()
        hrefs = instance.locate_valid_artworks()

        print(hrefs)
        return driver.page_source
    finally:
        # Runs on success and on failure alike; the return value above is
        # evaluated before the browser is closed.
        driver.quit()
|
||||
|
@ -3,8 +3,6 @@ services:
|
||||
build:
|
||||
context: ./app
|
||||
container_name: webscraper
|
||||
depends_on:
|
||||
- redis
|
||||
volumes:
|
||||
- ./app:/usr/src/app
|
||||
develop:
|
||||
@ -13,14 +11,20 @@ services:
|
||||
action: rebuild
|
||||
- path: ./app
|
||||
target: /usr/src/app
|
||||
action: sync
|
||||
redis:
|
||||
image: "redis:alpine"
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
action: sync+restart
|
||||
command: tail -f /dev/null
|
||||
selenium-hub:
|
||||
image: "selenium/hub:3.141.59"
|
||||
container_name: selenium-hub
|
||||
ports:
|
||||
- "6379:6379"
|
||||
- "4444:4444"
|
||||
# redis:
|
||||
# image: "redis:alpine"
|
||||
# volumes:
|
||||
# - redis_data:/data
|
||||
# ports:
|
||||
# - "6379:6379"
|
||||
|
||||
volumes:
|
||||
redis_data:
|
||||
# redis_data:
|
||||
app:
|
||||
|
Loading…
Reference in New Issue
Block a user