Compare commits

..

No commits in common. "ba9e42b9c6f15b5a237a2bd824c2bdc4bbe74ace" and "cc79bb9d9df67407ae026e0720df890bf62013cd" have entirely different histories.

10 changed files with 151 additions and 187 deletions

View File

@ -2,15 +2,18 @@
## Description ## Description
This project is a web scraper designed to extract data from websites. This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats.
## Features ## Features
☑️ Extracts data from web pages - Extracts data from web pages
<!-- - Supports multiple data formats (CSV, JSON, etc.)
- Customizable scraping rules
- Error handling and logging -->
## Usage ## Installation
### With Docker ### Using Docker
1. Clone the repository: 1. Clone the repository:
@ -24,51 +27,57 @@ git clone https://git.wmi.amu.edu.pl/s500042/webscraper
cd webscraper cd webscraper
``` ```
3. Build the Docker image and run it using `start.py` script: 3. Build the Docker image and run it using script:
- On Linux, ?Mac <!-- I haven't tested it yet -->
```bash ```bash
python scripts/start.py ./start.sh
``` ```
On Mac, you'll have to use - Windows 🤡
```bash ```bash
python3 scripts/start.py python start.py
``` ```
4. Check `/app/dist/data.json` file to see the extracted data. This one will work just fine on Linux, but on Mac, you'll have to use
```bash
python3 start.py
```
### Without Docker ### Without Docker
1. Clone the repository: 1. Clone the repository:
```bash ```bash
git clone https://git.wmi.amu.edu.pl/s500042/webscraper git clone https://github.com/yourusername/webscraper.git
``` ```
2. Install the required dependencies: 2. Navigate to the project directory:
```bash ```bash
pip install -r app/requirements.txt cd webscraper/app
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
``` ```
If you're on Arch Linux, you'll need to create a virtual environment. If you're on Arch Linux, you'll need to create a virtual environment.
Here is a [Step by step guide](#) that will help you create it. Here is a [Step by step guide](#) that will help you create it.
3. Run `run_with_no_docker.py` script: ## Usage
1. Configure the scraper by editing the `config.json` file.
2. Run the scraper:
```bash ```bash
python scripts/run_with_no_docker.py python scraper.py
``` ```
On Mac, you'll need to use:
```bash
python3 scripts/run_with_no_docker.py
```
4. Check `/app/dist/data.json` file to see the extracted data.
## License ## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

33
scripts/commands.py Normal file
View File

@ -0,0 +1,33 @@
import subprocess
def quitCondition(command: str) -> bool:
    """Report whether ``command`` is one of the quit keywords.

    Only the exact strings "q", "quit", "exit" and "stop" match.
    """
    quit_words = ("q", "quit", "exit", "stop")
    return command in quit_words
def helpCondition(command: str) -> bool:
    """Report whether ``command`` is one of the help keywords ("h" or "help")."""
    help_words = ("h", "help")
    return command in help_words
def clearCondition(command: str) -> bool:
    """Report whether ``command`` is one of the clear keywords.

    Only the exact strings "c", "clear" and "cls" match.
    """
    clear_words = ("c", "clear", "cls")
    return command in clear_words
def systemCommand(command: str) -> str:
    """Run a prompt command inside the ``webscraper`` container.

    The first character of ``command`` is treated as a prefix and dropped;
    the rest is split on whitespace and appended to
    ``docker exec -it webscraper``.

    Args:
        command: The raw prompt input, including its one-character prefix.

    Returns:
        The decoded stdout of the docker invocation, or the
        "Command not found" hint when nothing follows the prefix.
        stderr is captured but not returned.
    """
    words = command[1:].split()
    # str.split() with no separator never yields empty strings, so an empty
    # command appears as an empty list; indexing words[0] would raise
    # IndexError instead of reaching the hint below.
    if not words:
        return "Command not found. Write 'h' for help."
    return subprocess.run(
        f'docker exec -it webscraper {" ".join(words)}',
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ).stdout.decode()
def restartCondition(command: str) -> bool:
    """Report whether ``command`` is one of the restart keywords ("r" or "restart")."""
    restart_words = ("r", "restart")
    return command in restart_words
def runCondition(command: str) -> bool:
    """Report whether ``command`` is the run keyword ("run")."""
    run_words = ("run",)
    return command in run_words

View File

@ -1,30 +0,0 @@
import os
from run_command import run_command
def get_path():
# Derive a project path from the current working directory reported by `pwd`.
# Returns either a "/"-joined path string or the literal message
# "This is not a valid webscraper project.".
# NOTE(review): indentation is missing in this view, so it cannot be told
# whether the first `else:` below pairs with the `for` loop or with one of
# the `if` statements — confirm against the original file before editing.
pwd = run_command("pwd")
splitted = pwd.split("/")
# Remove the newline characters from the last path component.
splitted[-1] = splitted[-1].replace("\n", "")
# Prints the components up to the first "webscraper" entry; list.index
# raises ValueError when no component equals "webscraper".
print(splitted[: splitted.index("webscraper") + 1])
# count(...) > 1 already implies membership, so the second test is redundant.
if splitted.count("webscraper") > 1 and "webscraper" in splitted:
# Walk the prefixes from the full path down to the first component.
for i in range(len(splitted) - 1, -1, -1):
potential_path = "/".join(splitted[: i + 1])
# Substring test on the joined path, not a per-component comparison.
if "webscraper" in potential_path:
script_path = f"{potential_path}/scripts"
# A prefix is returned when it contains a "scripts" directory.
if os.path.isdir(script_path):
return potential_path
else:
return "This is not a valid webscraper project."
else:
# Path up to and including the first "webscraper" component.
return "/".join(splitted[: splitted.index("webscraper") + 1])
def run_main():
    """Print the value returned by get_path()."""
    project_root = get_path()
    print(project_root)
if __name__ == "__main__":
run_main()

View File

@ -1,13 +0,0 @@
import subprocess
import sys
def run_command(command: str) -> str:
    """Execute ``command`` through the shell and return its decoded stdout.

    When the command exits with a non-zero status, the command text and its
    decoded stderr are printed and the interpreter exits with that status.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.returncode == 0:
        return completed.stdout.decode()
    print(f"Error running command: {command}")
    print(completed.stderr.decode())
    sys.exit(completed.returncode)

View File

@ -1,17 +0,0 @@
import subprocess
import os
from run_command import run_command
from get_path import get_path
def run_main():
    """Run ``app/main.py`` with ``python3`` under the path from get_path().

    The output returned by run_command is printed.
    """
    project_root = get_path()
    command = f"python3 {project_root}/app/main.py"
    try:
        print(run_command(command))
    # NOTE(review): this handler only fires if run_command raises
    # CalledProcessError — confirm that it can.
    except subprocess.CalledProcessError as e:
        print(f"Error occurred: {e.stderr}")
if __name__ == "__main__":
run_main()

View File

@ -1,15 +1,24 @@
import subprocess import subprocess
import os import os
from run_command import run_command import sys
def run_command(command: str) -> str:
    """Run ``command`` in a shell and hand back its stdout as text.

    A non-zero exit status prints the command and its decoded stderr, then
    terminates the interpreter with the same status.
    """
    result = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    failed = result.returncode != 0
    if failed:
        print(f"Error running command: {command}")
        print(result.stderr.decode())
        sys.exit(result.returncode)
    return result.stdout.decode()
def main(): def main():
docker_compose_file = os.getenv( docker_compose_file = os.getenv("DOCKER_COMPOSE_FILE", "./app/docker-compose.yaml")
"DOCKER_COMPOSE_FILE", "$WEBSCRAPER/app/docker-compose.yaml"
)
service_name = os.getenv("SERVICE_NAME", "webscraper") service_name = os.getenv("SERVICE_NAME", "webscraper")
script_name = os.getenv("SCRIPT_NAME", "main.py") script_name = os.getenv("SCRIPT_NAME", "main.py")
try:
print("Starting Docker Compose services...\n") print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d") run_command(f"docker compose -f {docker_compose_file} up -d")
@ -17,9 +26,6 @@ def main():
print("Stopping and removing Docker Compose services...") print("Stopping and removing Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} down") run_command(f"docker compose -f {docker_compose_file} down")
except subprocess.CalledProcessError as e:
print("An error occurred while running the script.")
print(e)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,57 +0,0 @@
from run_command import run_command
def quitCondition(command: str) -> bool:
    """Return True when ``command`` equals "q", "quit", "exit" or "stop"."""
    accepted = ("q", "quit", "exit", "stop")
    return command in accepted
def quitService(path: str):
    """Announce and run ``docker compose ... down``.

    ``path`` is the directory holding ``app/docker-compose.yaml``.
    Always returns None.
    """
    print("Stopping and removing Docker Compose services...")
    down_command = f"docker compose -f {path}/app/docker-compose.yaml down"
    run_command(down_command)
def helpCondition(command: str) -> bool:
    """Return True when ``command`` equals "h" or "help"."""
    accepted = ("h", "help")
    return command in accepted
def clearCondition(command: str) -> bool:
    """Return True when ``command`` equals "c", "clear" or "cls"."""
    accepted = ("c", "clear", "cls")
    return command in accepted
def systemCommand(command: str) -> str:
    """Run a prompt command inside the ``webscraper`` container and print it.

    The first character of ``command`` is treated as a prefix and dropped;
    the rest is split on whitespace and appended to
    ``docker exec -it webscraper``, which is passed to run_command.

    Args:
        command: The raw prompt input, including its one-character prefix.

    Returns:
        The "Command not found" hint when nothing follows the prefix.
        Otherwise the run_command result is printed and None is returned,
        despite the ``str`` annotation.
    """
    words = command[1:].split()
    # str.split() with no separator never yields empty strings, so an empty
    # command appears as an empty list; indexing words[0] would raise
    # IndexError instead of reaching the hint below.
    if not words:
        return "Command not found. Write 'h' for help."
    print(
        run_command(
            f'docker exec -it webscraper {" ".join(words)}',
        )
    )
    return None
def restartCondition(command: str) -> bool:
    """Return True when ``command`` equals "r" or "restart"."""
    accepted = ("r", "restart")
    return command in accepted
def restartService(path: str):
    """Run ``docker compose ... down`` and then ``up -d``.

    ``path`` is the directory holding ``app/docker-compose.yaml``.
    Progress messages are printed before and after. Always returns None.
    """
    compose = f"docker compose -f {path}/app/docker-compose.yaml"
    print("Restarting Docker Compose services...")
    # Order matters: "down" runs first, then "up -d".
    for action in ("down", "up -d"):
        run_command(f"{compose} {action}")
    print("Composed!")
def runCondition(command: str) -> bool:
    """Return True when ``command`` equals "run"."""
    accepted = ("run",)
    return command in accepted
def runService():
    """Run ``python main.py`` in the ``webscraper`` container via run_command.

    The value returned by run_command is printed. Always returns None.
    """
    print("Running main.py...")
    output = run_command("docker exec -it webscraper python main.py")
    print(output)

View File

@ -1,9 +0,0 @@
def help_list():
# Return the multi-line help text that lists each accepted keyword group
# followed by a short description.
# NOTE(review): leading whitespace inside the literal is runtime text and is
# not visible in this view; keep the literal untouched.
return """
["h", "help"], - for help.
["q", "quit", "exit", "stop"], - to stop program.
["c", "clear", "cls"], - to clear console.
["r", "restart"], - to restart Docker Compose services.
["run"], - to run main.py in docker container.
["$..."], - to evaluate command in docker container.
"""

View File

@ -1,30 +1,65 @@
import commands
import subprocess
import sys import sys
from threads.commands import * import threading
from run_command import run_command
from get_path import get_path
from threads.help_list import help_list
def prompt(): def prompt():
while True: while True:
command = input("> ") command = input("> ")
if quitCondition(command): if commands.quitCondition(command):
quitService(get_path()) print("Stopping and removing Docker Compose services...")
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
break break
if helpCondition(command): if commands.helpCondition(command):
print(help_list()) print(
"""
["h", "help"], - for help.
["q", "quit", "exit", "stop"], - to stop program.
["c", "clear", "cls"], - to clear console.
["r", "restart"], - to restart Docker Compose services.
["run"], - to run main.py in docker container.
["$..."], - to evaluate command in docker container.
"""
)
continue continue
if clearCondition(command): if commands.clearCondition(command):
run_command("clear") print("\n" * 100)
continue continue
if command.startswith("$"): if command.startswith("$"):
systemCommand(command) print(commands.systemCommand(command))
continue continue
if restartCondition(command): if commands.restartCondition(command):
restartService(get_path()) print("Restarting Docker Compose services...")
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
subprocess.run(
"docker compose -f ../app/docker-compose.yaml up -d",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
print("Composed!")
continue continue
if runCondition(command): if commands.runCondition(command):
runService() print("Running main.py...")
print(
subprocess.run(
"docker exec -it webscraper python main.py",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
).stdout.decode()
)
continue continue
if command == "": if command == "":
continue continue

View File

@ -1,20 +1,26 @@
import subprocess
import time import time
import os import os
import threading import threading
from threads.prompt import prompt from threads.prompt import prompt
from run_command import run_command
from get_path import get_path
def run_command(command: str) -> str:
    """Run ``command`` in a shell and return its decoded output.

    On success the decoded stdout is returned. On a non-zero exit status the
    command text is printed and the decoded stderr is returned instead.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.returncode == 0:
        return completed.stdout.decode()
    print(f"Error running command: {command}")
    return completed.stderr.decode()
thread = threading.Thread(target=prompt) thread = threading.Thread(target=prompt)
def main(): def main():
path = get_path()
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", f"{path}/app/docker-compose.yaml"
)
print("Starting Docker Compose services...") print("Starting Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} up -d") run_command("docker compose -f ../app/docker-compose.yaml up -d")
print("Composed!\n") print("Composed!\n")
print("Running main.py...") print("Running main.py...")
print(run_command("docker exec -it webscraper python main.py")) print(run_command("docker exec -it webscraper python main.py"))
@ -25,9 +31,10 @@ def main():
print("\nWatching for changes...") print("\nWatching for changes...")
thread.start() thread.start()
path_to_watch = "/home/paprykdev/uni/webscraper/app"
before = { before = {
f: os.stat(os.path.join(path, "app", f)).st_mtime f: os.stat(os.path.join(path_to_watch, f)).st_mtime
for f in os.listdir(os.path.join(path, "app")) for f in os.listdir(path_to_watch)
if f.endswith(".py") if f.endswith(".py")
} }
@ -36,8 +43,8 @@ def main():
break break
time.sleep(1) time.sleep(1)
after = { after = {
f: os.stat(os.path.join(path, "app", f)).st_mtime f: os.stat(os.path.join(path_to_watch, f)).st_mtime
for f in os.listdir(os.path.join(path, "app")) for f in os.listdir(path_to_watch)
if f.endswith(".py") if f.endswith(".py")
} }
for f in before: for f in before: