Compare commits

..

4 Commits

10 changed files with 187 additions and 151 deletions

View File

@ -2,18 +2,15 @@
## Description
This project is a web scraper designed to extract data from websites. It can be customized to scrape various types of data and save it in different formats.
This project is a web scraper designed to extract data from websites.
## Features
- Extracts data from web pages
<!-- - Supports multiple data formats (CSV, JSON, etc.)
- Customizable scraping rules
- Error handling and logging -->
☑️ Extracts data from web pages
## Installation
## Usage
### Using Docker
### With Docker
1. Clone the repository:
@ -27,57 +24,51 @@ git clone https://git.wmi.amu.edu.pl/s500042/webscraper
cd webscraper
```
3. Build the Docker image and run it using script:
- On Linux, ?Mac <!-- I haven't tested it yet -->
3. Build the Docker image and run it using the `start.py` script:
```bash
./start.sh
python scripts/start.py
```
- Windows 🤡
On Mac, you'll have to use
```bash
python start.py
python3 scripts/start.py
```
This one will work just fine on Linux, but on Mac, you'll have to use
```bash
python3 start.py
```
4. Check the `/app/dist/data.json` file to see the extracted data.
### Without Docker
1. Clone the repository:
```bash
git clone https://github.com/yourusername/webscraper.git
git clone https://git.wmi.amu.edu.pl/s500042/webscraper
```
2. Navigate to the project directory:
2. Install the required dependencies:
```bash
cd webscraper/app
```
3. Install the required dependencies:
```bash
pip install -r requirements.txt
pip install -r app/requirements.txt
```
If you're on Arch Linux, you'll need to create a virtual environment.
Here is a [step-by-step guide](#) that will help you create it.
## Usage
1. Configure the scraper by editing the `config.json` file.
2. Run the scraper:
3. Run the `run_with_no_docker.py` script:
```bash
python scraper.py
python scripts/run_with_no_docker.py
```
On Mac, you'll need to use:
```bash
python3 scripts/run_with_no_docker.py
```
4. Check the `/app/dist/data.json` file to see the extracted data.
## License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

View File

@ -1,33 +0,0 @@
import subprocess
def quitCondition(command: str) -> bool:
    """Return True when *command* is one of the quit keywords."""
    quit_words = ("q", "quit", "exit", "stop")
    return command in quit_words
def helpCondition(command: str) -> bool:
    """Return True when *command* is one of the help keywords."""
    help_words = ("h", "help")
    return command in help_words
def clearCondition(command: str) -> bool:
    """Return True when *command* is one of the clear-screen keywords."""
    clear_words = ("c", "clear", "cls")
    return command in clear_words
def systemCommand(command: str) -> str:
    """Run a command inside the ``webscraper`` container and return its stdout.

    Args:
        command: Raw prompt input. The first character is dropped and the
            remainder is forwarded to ``docker exec -it webscraper``.

    Returns:
        The decoded stdout of the ``docker exec`` call, or a hint message
        when nothing follows the first character.
    """
    words = command[1:].split()
    # str.split() never produces empty strings, so "no command given" shows
    # up as an empty list; indexing words[0] here would raise IndexError.
    if not words:
        return "Command not found. Write 'h' for help."
    return subprocess.run(
        f'docker exec -it webscraper {" ".join(words)}',
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ).stdout.decode()
def restartCondition(command: str) -> bool:
    """Return True when *command* is one of the restart keywords."""
    restart_words = ("r", "restart")
    return command in restart_words
def runCondition(command: str) -> bool:
    """Return True when *command* is exactly the ``run`` keyword."""
    return command == "run"

30
scripts/get_path.py Normal file
View File

@ -0,0 +1,30 @@
import os
from run_command import run_command
def get_path():
    """Return the webscraper project root derived from the working directory.

    The current working directory is split into components:

    * If no component is named ``webscraper``, an explanatory message is
      returned (the lookup used to raise ``ValueError`` in this case).
    * If exactly one component is named ``webscraper``, the path up to and
      including it is returned.
    * If several are, the directories are tried from the deepest upwards and
      the first one that contains a ``scripts`` sub-directory is returned.

    Returns:
        The project root as a ``/``-separated string, or the message
        ``"This is not a valid webscraper project."`` when none is found.
    """
    not_a_project = "This is not a valid webscraper project."

    # os.getcwd() avoids spawning a shell just to run `pwd`; separators are
    # normalised so the "/"-based handling also works where os.sep differs.
    parts = os.getcwd().replace(os.sep, "/").split("/")

    if "webscraper" not in parts:
        return not_a_project

    if parts.count("webscraper") == 1:
        return "/".join(parts[: parts.index("webscraper") + 1])

    # Nested "webscraper" directories: walk up from the deepest directory.
    for end in range(len(parts), 0, -1):
        candidate = "/".join(parts[:end])
        if "webscraper" not in candidate:
            # Shorter prefixes cannot contain the name either.
            break
        if os.path.isdir(f"{candidate}/scripts"):
            return candidate
    return not_a_project
def run_main():
    """Print the value returned by get_path()."""
    project_root = get_path()
    print(project_root)


if __name__ == "__main__":
    run_main()

13
scripts/run_command.py Normal file
View File

@ -0,0 +1,13 @@
import subprocess
import sys
def run_command(command: str) -> str:
    """Run *command* through the shell and return its decoded stdout.

    On a non-zero exit status the command and its stderr are printed and the
    interpreter exits with that same status.
    """
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if completed.returncode == 0:
        return completed.stdout.decode()

    print(f"Error running command: {command}")
    print(completed.stderr.decode())
    sys.exit(completed.returncode)

View File

@ -0,0 +1,17 @@
import subprocess
import os
from run_command import run_command
from get_path import get_path
def run_main():
    """Run ``app/main.py`` under ``python3`` and print its output.

    The script location is built from the project root returned by
    ``get_path()``.
    """
    path = get_path()
    # No CalledProcessError handler: subprocess.run() without check=True never
    # raises it, and run_command (scripts/run_command.py) already prints
    # stderr and exits on a non-zero status.
    print(run_command(f"python3 {path}/app/main.py"))


if __name__ == "__main__":
    run_main()

View File

@ -1,31 +1,25 @@
import subprocess
import os
import sys
def run_command(command: str) -> str:
    """Run *command* through the shell and return its decoded stdout.

    On a non-zero exit status the command and its stderr are printed and the
    interpreter exits with that same status.
    """
    completed = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if completed.returncode == 0:
        return completed.stdout.decode()

    print(f"Error running command: {command}")
    print(completed.stderr.decode())
    sys.exit(completed.returncode)
from run_command import run_command
def main():
docker_compose_file = os.getenv("DOCKER_COMPOSE_FILE", "./app/docker-compose.yaml")
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", "$WEBSCRAPER/app/docker-compose.yaml"
)
service_name = os.getenv("SERVICE_NAME", "webscraper")
script_name = os.getenv("SCRIPT_NAME", "main.py")
try:
print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d")
print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d")
print(run_command(f"docker exec {service_name} python {script_name}"))
print(run_command(f"docker exec {service_name} python {script_name}"))
print("Stopping and removing Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} down")
print("Stopping and removing Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} down")
except subprocess.CalledProcessError as e:
print("An error occurred while running the script.")
print(e)
if __name__ == "__main__":

View File

@ -0,0 +1,57 @@
from run_command import run_command
def quitCondition(command: str) -> bool:
    """Return True when *command* is one of the quit keywords."""
    quit_words = ("q", "quit", "exit", "stop")
    return command in quit_words
def quitService(path: str):
    """Bring down the Compose stack defined in ``<path>/app/docker-compose.yaml``.

    Args:
        path: Directory that contains the ``app`` folder with the compose file.
    """
    compose_file = f"{path}/app/docker-compose.yaml"
    print("Stopping and removing Docker Compose services...")
    run_command(f"docker compose -f {compose_file} down")
def helpCondition(command: str) -> bool:
    """Return True when *command* is one of the help keywords."""
    help_words = ("h", "help")
    return command in help_words
def clearCondition(command: str) -> bool:
    """Return True when *command* is one of the clear-screen keywords."""
    clear_words = ("c", "clear", "cls")
    return command in clear_words
def systemCommand(command: str) -> "str | None":
    """Run a command inside the ``webscraper`` container and print its output.

    Args:
        command: Raw prompt input. The first character is dropped and the
            remainder is forwarded to ``docker exec -it webscraper``.

    Returns:
        A hint message when nothing follows the first character (the caller
        is responsible for displaying it); otherwise ``None`` after the
        command output has been printed.
    """
    words = command[1:].split()
    # str.split() never produces empty strings, so "no command given" shows
    # up as an empty list; indexing words[0] here would raise IndexError.
    if not words:
        return "Command not found. Write 'h' for help."
    print(run_command(f'docker exec -it webscraper {" ".join(words)}'))
    return None
def restartCondition(command: str) -> bool:
    """Return True when *command* is one of the restart keywords."""
    restart_words = ("r", "restart")
    return command in restart_words
def restartService(path: str):
    """Take the Compose stack down and start it again in detached mode.

    Args:
        path: Directory that contains the ``app`` folder with the compose file.
    """
    print("Restarting Docker Compose services...")
    # Same two commands as before, issued in the same order: down, then up -d.
    for action in ("down", "up -d"):
        run_command(f"docker compose -f {path}/app/docker-compose.yaml {action}")
    print("Composed!")
def runCondition(command: str) -> bool:
    """Return True when *command* is exactly the ``run`` keyword."""
    return command == "run"
def runService():
    """Execute ``main.py`` inside the ``webscraper`` container and print its output."""
    print("Running main.py...")
    output = run_command("docker exec -it webscraper python main.py")
    print(output)

View File

@ -0,0 +1,9 @@
def help_list():
    """Return the help text that lists every prompt command with its aliases."""
    entries = (
        '["h", "help"], - for help.',
        '["q", "quit", "exit", "stop"], - to stop program.',
        '["c", "clear", "cls"], - to clear console.',
        '["r", "restart"], - to restart Docker Compose services.',
        '["run"], - to run main.py in docker container.',
        '["$..."], - to evaluate command in docker container.',
    )
    # The text starts and ends with a newline, one entry per line in between.
    return "\n" + "\n".join(entries) + "\n"

View File

@ -1,65 +1,30 @@
import commands
import subprocess
import sys
import threading
from threads.commands import *
from run_command import run_command
from get_path import get_path
from threads.help_list import help_list
def prompt():
while True:
command = input("> ")
if commands.quitCondition(command):
print("Stopping and removing Docker Compose services...")
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if quitCondition(command):
quitService(get_path())
break
if commands.helpCondition(command):
print(
"""
["h", "help"], - for help.
["q", "quit", "exit", "stop"], - to stop program.
["c", "clear", "cls"], - to clear console.
["r", "restart"], - to restart Docker Compose services.
["run"], - to run main.py in docker container.
["$..."], - to evaluate command in docker container.
"""
)
if helpCondition(command):
print(help_list())
continue
if commands.clearCondition(command):
print("\n" * 100)
if clearCondition(command):
run_command("clear")
continue
if command.startswith("$"):
print(commands.systemCommand(command))
systemCommand(command)
continue
if commands.restartCondition(command):
print("Restarting Docker Compose services...")
subprocess.run(
"docker compose -f ../app/docker-compose.yaml down",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
subprocess.run(
"docker compose -f ../app/docker-compose.yaml up -d",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
print("Composed!")
if restartCondition(command):
restartService(get_path())
continue
if commands.runCondition(command):
print("Running main.py...")
print(
subprocess.run(
"docker exec -it webscraper python main.py",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
).stdout.decode()
)
if runCondition(command):
runService()
continue
if command == "":
continue

View File

@ -1,26 +1,20 @@
import subprocess
import time
import os
import threading
from threads.prompt import prompt
def run_command(command: str) -> str:
    """Run *command* through the shell and return its decoded output.

    Returns stdout when the command succeeds. On a non-zero exit status the
    command is reported on the console and its decoded stderr is returned
    instead.
    """
    result = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    if result.returncode == 0:
        return result.stdout.decode()

    print(f"Error running command: {command}")
    return result.stderr.decode()
from run_command import run_command
from get_path import get_path
thread = threading.Thread(target=prompt)
def main():
path = get_path()
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", f"{path}/app/docker-compose.yaml"
)
print("Starting Docker Compose services...")
run_command("docker compose -f ../app/docker-compose.yaml up -d")
run_command(f"docker compose -f {docker_compose_file} up -d")
print("Composed!\n")
print("Running main.py...")
print(run_command("docker exec -it webscraper python main.py"))
@ -31,10 +25,9 @@ def main():
print("\nWatching for changes...")
thread.start()
path_to_watch = "/home/paprykdev/uni/webscraper/app"
before = {
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
for f in os.listdir(path_to_watch)
f: os.stat(os.path.join(path, "app", f)).st_mtime
for f in os.listdir(os.path.join(path, "app"))
if f.endswith(".py")
}
@ -43,8 +36,8 @@ def main():
break
time.sleep(1)
after = {
f: os.stat(os.path.join(path_to_watch, f)).st_mtime
for f in os.listdir(path_to_watch)
f: os.stat(os.path.join(path, "app", f)).st_mtime
for f in os.listdir(os.path.join(path, "app"))
if f.endswith(".py")
}
for f in before: