feat: separate all logic into small methods
Some checks are pending
Docker Image CI / build (push) Waiting to run
This commit is contained in:
parent a451409fa6
commit f65292d891
app/Dockerfile
@@ -3,24 +3,13 @@ FROM python:3.9-slim
 WORKDIR /usr/src/app
 
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --trusted-host pypi.python.org -r requirements.txt
 
 COPY . .
 
-RUN apt-get update && apt-get install -y \
-    wget \
-    unzip \
-    curl \
-    libx11-dev \
-    libgdk-pixbuf2.0-0 \
-    libcanberra-gtk-module \
-    && wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
-    && unzip chromedriver_linux64.zip \
-    && mv chromedriver /usr/local/bin/ \
-    && chmod +x /usr/local/bin/chromedriver
-
-RUN useradd python
-RUN chown -R python /usr/src/app
-USER python
-
+RUN apt-get update && apt-get install -y wget unzip && \
+    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
+    apt install -y ./google-chrome-stable_current_amd64.deb && \
+    rm ./google-chrome-stable_current_amd64.deb && \
+    apt-get clean
 CMD ["python", "main.py"]
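The image no longer pins chromedriver 114 or creates the unprivileged python user; Chrome is installed from Google's .deb and the container runs as root. Driver resolution moves to runtime: in scraper.py below, webdriver-manager downloads a chromedriver matching the installed Chrome whenever the /.dockerenv marker file is present. A minimal sketch of that runtime path (it mirrors the scraper.py change; this is not a separate file in the commit):

# Sketch of the runtime driver resolution the new image relies on
# (mirrors scraper.py below; not a separate file in this commit).
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")  # needed now that Chrome runs as root

# /.dockerenv exists only inside Docker containers, so the chromedriver
# download happens in the container and nowhere else.
in_docker = os.path.exists("/.dockerenv")
service = Service(ChromeDriverManager().install()) if in_docker else None
driver = webdriver.Chrome(options=options, service=service)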
app/main.py
@@ -1,12 +1,26 @@
-from scraper import scraper
+from scraper import scrap
+import os
+import json
 import time
 
 
+urls = ["https://digitalprojects.wpi.art/monet/artworks"]
+hrefs = []
+
+
 def main():
-    print("Starting the application...\n\n")
-    scraper()
-    print("\n\nApplication finished!")
+    directory = "dist"
+    file_path = os.path.join(directory, "data.json")
+    scrap(urls[0])
+    time.sleep(8)
+
+    data = []
+
+    try:
+        os.mkdir("dist")
+    except FileExistsError:
+        pass
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file)
+    print("Data has been scraped!")
 
 
 if __name__ == "__main__":
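Note that scrap() returns the page HTML but main() discards it, so data stays empty and dist/data.json is always written as []. A hypothetical variant that captures the return value (not part of this commit):

# Hypothetical variant, not in this commit: keep scrap()'s return value
# so dist/data.json contains the scraped page rather than an empty list.
import json
import os

from scraper import scrap


def main():
    html = scrap("https://digitalprojects.wpi.art/monet/artworks")
    os.makedirs("dist", exist_ok=True)  # replaces the try/except around os.mkdir
    with open(os.path.join("dist", "data.json"), "w", encoding="utf-8") as file:
        json.dump([html], file)


if __name__ == "__main__":
    main()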
app/requirements.txt
@@ -1,8 +1,24 @@
+attrs==24.2.0
 beautifulsoup4==4.12.3
 bs4==0.0.2
 certifi==2024.8.30
 charset-normalizer==3.4.0
+h11==0.14.0
 idna==3.10
+lxml==5.3.0
+outcome==1.3.0.post0
+packaging==24.2
+PySocks==1.7.1
+python-dotenv==1.0.1
 requests==2.32.3
+selenium==4.26.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
 soupsieve==2.6
+trio==0.27.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
 urllib3==2.2.3
+webdriver-manager==4.0.2
+websocket-client==1.8.0
+wsproto==1.2.0
app/scraper.py
@@ -1,17 +1,78 @@
 import os
-import json
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+import time
 
 
-def scraper():
-    directory = "dist"
-    file_path = os.path.join(directory, "data.json")
-
-    data = []
-
-    try:
-        os.mkdir("dist")
-    except FileExistsError:
-        pass
-    with open(file_path, "w", encoding="utf-8") as file:
-        json.dump(data, file)
-    print("Data has been scraped!")
+class Scraper:
+    def __init__(self, url):
+        self.url = url
+        self.hrefs = []
+        self.driver = self.load_driver()
+
+    def load_driver(self) -> webdriver.Chrome:
+        options = webdriver.ChromeOptions()
+        options.add_argument("--headless")
+        options.add_argument("--no-sandbox")
+
+        return webdriver.Chrome(
+            options=options,
+            service=(
+                Service(ChromeDriverManager().install())
+                if os.path.exists("/.dockerenv")
+                else None
+            ),
+        )
+
+    def skip_cookies(self) -> None:
+        WebDriverWait(self.driver, 5).until(
+            EC.presence_of_element_located(
+                (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
+            )
+        )
+
+        button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
+        self.driver.execute_script(
+            """
+            arguments[0].removeAttribute('disabled');
+            arguments[0].className = 'border-button';
+            """,
+            button,
+        )
+        button.click()
+        time.sleep(2)
+
+    def load_page(self) -> None:
+        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(2)
+
+    def locate_valid_artworks(self) -> list[str]:
+        WebDriverWait(self.driver, 5).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
+        )
+        artworks = self.driver.find_elements(
+            By.CSS_SELECTOR, ".artwork-search-results article"
+        )
+        for artwork in artworks:
+            href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
+            self.hrefs.append(href)
+        return self.hrefs
+
+
+def scrap(url: str):
+    instance = Scraper(url)
+    driver = instance.driver
+    driver.get(url)
+
+    instance.skip_cookies()
+    instance.load_page()
+    hrefs = instance.locate_valid_artworks()
+
+    print(hrefs)
+    html = driver.page_source
+    driver.quit()
+    return html
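The module now splits driver setup, cookie-wall dismissal, scrolling, and link collection into one method each, with scrap() as a thin wrapper. A usage sketch (outside Docker the /.dockerenv branch passes service=None, so Selenium must be able to locate a local chromedriver on its own):

# Usage sketch for the refactored API.
from scraper import Scraper, scrap

# One-shot helper: navigates, dismisses the cookie wall, scrolls,
# prints the collected hrefs, and returns the final page HTML.
html = scrap("https://digitalprojects.wpi.art/monet/artworks")

# Or drive the steps individually:
instance = Scraper("https://digitalprojects.wpi.art/monet/artworks")
instance.driver.get(instance.url)
instance.skip_cookies()
instance.load_page()
print(instance.locate_valid_artworks())
instance.driver.quit()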
@@ -3,8 +3,6 @@ services:
     build:
       context: ./app
     container_name: webscraper
-    depends_on:
-      - redis
     volumes:
       - ./app:/usr/src/app
     develop:
@@ -13,14 +11,20 @@ services:
           action: rebuild
         - path: ./app
           target: /usr/src/app
-          action: sync
-  redis:
-    image: "redis:alpine"
-    volumes:
-      - redis_data:/data
-    ports:
-      - "6379:6379"
+          action: sync+restart
+    command: tail -f /dev/null
+  selenium-hub:
+    image: "selenium/hub:3.141.59"
+    container_name: selenium-hub
+    ports:
+      - "4444:4444"
+  # redis:
+  #   image: "redis:alpine"
+  #   volumes:
+  #     - redis_data:/data
+  #   ports:
+  #     - "6379:6379"
 
 volumes:
-  redis_data:
+  # redis_data:
   app:
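The redis service is commented out and replaced by a selenium-hub service on the standard hub port 4444, while command: tail -f /dev/null keeps the webscraper container alive. The scraper itself still launches a local Chrome; connecting it to the hub would require webdriver.Remote — a sketch, assuming the hub is reached by its compose service name and has at least one browser node attached:

# Sketch (not in this commit): pointing the scraper at the selenium-hub
# service instead of a locally launched Chrome.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")

# "selenium-hub" resolves on the compose network; 4444 is the port
# published above, and hub 3.x serves sessions under /wd/hub.
driver = webdriver.Remote(
    command_executor="http://selenium-hub:4444/wd/hub",
    options=options,
)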