OtoMoto web scraper source code added

This commit is contained in:
Damian Michalski 2020-01-19 03:23:04 +01:00
parent 33f95c77bb
commit 6da93f41f0
4 changed files with 285 additions and 0 deletions

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Car4You web scraper module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- NOTE(review): "groupId" looks like an unedited placeholder value; confirm the intended group id. -->
<groupId>groupId</groupId>
<artifactId>Car4You_webscrapper</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- NOTE(review): no compiler source/target level is declared here, so the Java language level
     depends on the defaults of the build tool in use; consider pinning it explicitly. -->
<dependencies>
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<!-- PostgreSQL JDBC driver. -->
<!-- https://mvnrepository.com/artifact/postgresql/postgresql -->
<!-- NOTE(review): these appear to be the legacy driver coordinates; presumably newer driver
     releases are published under the org.postgresql group id. Verify before upgrading. -->
<dependency>
<groupId>postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>9.1-901-1.jdbc4</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,122 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.Hashtable;
/**
 * Crawls the OtoMoto passenger-car listing pages and stores every non-promoted offer.
 *
 * <p>For each listing page (1 to {@link #LAST_PAGE}) the thread collects the offer links,
 * skips offers that look promoted, downloads each remaining offer page, parses the car
 * parameters and the price, and hands them to {@code dbService.insert}.
 *
 * <p>Downloads are retried a bounded number of times with a pause between attempts. A page
 * or offer that still cannot be downloaded is skipped, so a permanently failing URL
 * (for example an offer that has been removed) cannot stall the crawl.
 */
public class CrawlerThread extends Thread {

    /** Listing URL; the 1-based page number is appended to it. */
    private static final String PAGE_URL = "https://www.otomoto.pl/osobowe/?search%5Bfilter_enum_has_registration%5D=1&search%5Bfilter_enum_damaged%5D=0&search%5Bfilter_enum_registered%5D=1&search%5Bfilter_enum_no_accident%5D=1&search%5Border%5D=filter_float_price%3Aasc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=&page=";

    /** Number of the last listing page that is visited. */
    private static final int LAST_PAGE = 500;

    /** Maximum number of download attempts per URL before it is skipped. */
    private static final int MAX_ATTEMPTS = 5;

    /** Pause between two download attempts of the same URL, in milliseconds. */
    private static final long RETRY_DELAY_MS = 2000L;

    /** Value stored for optional text parameters that the offer does not specify. */
    private static final String UNSPECIFIED = "nieokreślono";

    public CrawlerThread() {
    }

    /**
     * Walks through all listing pages and processes every non-promoted offer found on them.
     * Stops early when the running thread has been interrupted.
     */
    @Override
    public void run() {
        for (int page = 1; page <= LAST_PAGE; page++) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            System.out.println("Strona nr: " + page);
            String targetUrl = PAGE_URL + page;
            System.out.println("Link: " + targetUrl);

            Document listing = fetch(targetUrl);
            if (listing == null) {
                // The page could not be downloaded; move on to the next one.
                continue;
            }

            Elements links = listing.select("a.offer-title__link");
            for (Element link : links) {
                if (isPromoted(link)) {
                    System.out.println("Oferta promowana. Pomijam. " + link.text());
                } else {
                    processOffer(link.attr("href"));
                }
            }
        }
    }

    /**
     * Downloads and parses the document at {@code url}.
     *
     * @param url address to download
     * @return the parsed document, or {@code null} when every attempt failed or the thread
     *         was interrupted while waiting for the next attempt
     */
    private static Document fetch(String url) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url).get();
            } catch (IOException e) {
                System.out.println("Nie udalo sie polaczyc. " + e.getMessage());
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_DELAY_MS);
                } catch (InterruptedException e) {
                    // Keep the interrupt flag set so that run() can stop the crawl.
                    Thread.currentThread().interrupt();
                    return null;
                }
            }
        }
        return null;
    }

    /**
     * Tells whether the offer behind {@code link} looks promoted: the first {@code div} of
     * the link's fourth ancestor contains a {@code span}.
     *
     * <p>TODO: verify that this reliably detects promoted offers.
     *
     * @param link the offer title link taken from a listing page
     * @return {@code true} when the marker span is present; {@code false} when it is absent
     *         or the expected ancestor structure does not exist
     */
    private static boolean isPromoted(Element link) {
        Element container = link;
        for (int level = 0; level < 4 && container != null; level++) {
            container = container.parent();
        }
        if (container == null) {
            return false;
        }
        Element firstDiv = container.selectFirst("div");
        if (firstDiv == null) {
            return false;
        }
        return firstDiv.selectFirst("span") != null;
    }

    /**
     * Downloads a single offer page, parses it and stores the car through
     * {@code dbService.insert}. Any parsing problem is reported and the offer is skipped.
     *
     * @param url address of the offer page
     */
    private static void processOffer(String url) {
        System.out.println("Pobieram: " + url);
        Document offer = fetch(url);
        if (offer == null) {
            return;
        }

        try {
            Element priceElement = offer.select("span.offer-price__number").first();
            if (priceElement == null) {
                throw new IllegalStateException("Brak ceny");
            }
            Integer price = Integer.valueOf(parsePrice(priceElement.text()));

            // Collect the label -> value pairs of the offer parameter list.
            Hashtable<String, String> params = new Hashtable<String, String>();
            Elements items = offer.select("ul.offer-params__list").select("li.offer-params__item");
            for (Element item : items) {
                String key = item.select("span.offer-params__label").text();
                String value = item.select("div.offer-params__value").text();
                params.put(key, value);
            }

            String brand = params.get("Marka pojazdu");
            String model = params.get("Model pojazdu");
            String version = textOrUnspecified(params.get("Wersja"));

            // A missing production year is stored as 0 instead of rejecting the offer.
            String yearText = params.get("Rok produkcji");
            Integer year = yearText == null ? Integer.valueOf(0) : Integer.valueOf(yearText);

            // Only the leading number of the power value is used.
            Integer enginePower = Integer.valueOf(required(params, "Moc").split(" ")[0]);

            // Drop the spaces and the three-character unit suffix of the capacity value.
            String capacityText = required(params, "Pojemność skokowa").replaceAll(" ", "");
            capacityText = capacityText.substring(0, capacityText.length() - 3);
            Double engineCapacity = Double.valueOf(capacityText);

            String fuel = params.get("Rodzaj paliwa");
            String transmission = textOrUnspecified(params.get("Skrzynia biegów"));
            String drive = textOrUnspecified(params.get("Napęd"));
            String bodyType = params.get("Typ");
            Integer doors = Integer.valueOf(required(params, "Liczba drzwi"));
            Integer seats = Integer.valueOf(required(params, "Liczba miejsc"));

            dbService.insert(brand, model, version, year, enginePower, engineCapacity, fuel,
                    transmission, drive, bodyType, doors, seats, price);
        } catch (Exception e) {
            // One malformed offer must not stop the crawl: report it and carry on.
            System.out.println("Błąd pobierania danych!: " + e.getMessage());
        }
    }

    /**
     * Extracts the whole-number part of a price text such as {@code "12 345 PLN"} or
     * {@code "12 345,67 PLN"}.
     *
     * @param text price text as shown on the offer page
     * @return the digits before the first comma (or of the whole text when there is no comma)
     * @throws NumberFormatException when that part contains no digits or does not fit an int
     */
    static int parsePrice(String text) {
        int comma = text.indexOf(',');
        String wholePart = comma >= 0 ? text.substring(0, comma) : text;
        return Integer.parseInt(wholePart.replaceAll("[^0-9]", ""));
    }

    /** Returns the value stored under {@code key}, failing with a message that names the key. */
    private static String required(Hashtable<String, String> params, String key) {
        String value = params.get(key);
        if (value == null) {
            throw new IllegalStateException("Brak parametru: " + key);
        }
        return value;
    }

    /** Returns {@code value}, or the "unspecified" placeholder when it is {@code null}. */
    private static String textOrUnspecified(String value) {
        return value == null ? UNSPECIFIED : value;
    }
}

View File

@ -0,0 +1,21 @@
import java.util.ArrayList;
/**
 * Entry point of the scraper: creates the crawler threads and starts them.
 */
public class ThreadManager {

    /**
     * Number of crawler threads to start. Every {@code CrawlerThread} is created without
     * arguments, so this value only controls how many identical crawlers run.
     */
    private static final int THREAD_COUNT = 1;

    /**
     * Creates {@link #THREAD_COUNT} crawler threads and starts all of them.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        ArrayList<CrawlerThread> threads = new ArrayList<CrawlerThread>();

        // Create the threads.
        for (int i = 0; i < THREAD_COUNT; i++) {
            threads.add(new CrawlerThread());
        }

        // Start the threads.
        for (CrawlerThread thread : threads) {
            thread.start();
        }
    }
}

View File

@ -0,0 +1,115 @@
import java.sql.*;
/**
 * Persists scraped car offers in the {@code car} table.
 *
 * <p>Cars are aggregated: an offer whose brand, model, version, engine power, engine
 * capacity, fuel, transmission, drive, body type, door count and seat count match an
 * existing row updates that row's price statistics; otherwise a new row is inserted.
 */
public class dbService {
    // JDBC driver name and database URL
    static final String JDBC_DRIVER = "org.postgresql.Driver";
    static final String DB_URL = "";
    // Database credentials
    static final String USER = "";
    static final String PASS = "";

    /** Finds an already stored car with the same characteristics. */
    private static final String SELECT_SQL = "" +
            "SELECT * " +
            "FROM car " +
            "WHERE " +
            "brand = ? AND " +
            "model = ? AND " +
            "version = ? AND " +
            "engine_power = ? AND " +
            "engine_capacity = ? AND " +
            "fuel = ? AND " +
            "transmission = ? AND " +
            "drive = ? AND " +
            "body_type = ? AND " +
            "doors = ? AND " +
            "seats = ?";

    /** Updates the five price-statistics columns of an existing car. */
    private static final String UPDATE_SQL = "" +
            "UPDATE car " +
            "SET price_from = ?, price_to = ?, price_sum = ?, avarage = ?, counter = ? WHERE id = ?";

    /** Inserts a car that has not been seen before. */
    private static final String INSERT_SQL = "INSERT INTO car (brand, model, version, year, engine_power, engine_capacity, fuel, transmission, drive, body_type, doors, seats, price_from, price_to, price_sum, counter, avarage) " +
            "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";

    /**
     * Stores one scraped offer: updates the price statistics of the matching car, or
     * inserts a new car when none matches.
     *
     * <p>SQL errors are printed and swallowed. The connection, statements and result set
     * are always closed, whichever branch runs and whether or not an error occurs.
     * Explicit {@code finally} blocks are used so the class does not depend on Java 7 syntax.
     *
     * @param brand          car brand
     * @param model          car model
     * @param version        model version
     * @param year           production year; written only when a new car is inserted
     * @param enginePower    engine power
     * @param engineCapacity engine capacity
     * @param fuel           fuel type
     * @param transmission   gearbox type
     * @param drive          drive type
     * @param bodyType       body type
     * @param doors          number of doors
     * @param seats          number of seats
     * @param price          offer price
     */
    public static void insert(String brand, String model, String version, Integer year, Integer enginePower, Double engineCapacity, String fuel, String transmission, String drive, String bodyType, Integer doors, Integer seats, Integer price) {
        Connection conn = null;
        PreparedStatement select = null;
        ResultSet rs = null;
        try {
            conn = DriverManager.getConnection(DB_URL, USER, PASS);

            // First check whether such a car is already stored.
            select = conn.prepareStatement(SELECT_SQL);
            select.setString(1, brand);
            select.setString(2, model);
            select.setString(3, version);
            select.setInt(4, enginePower);
            select.setDouble(5, engineCapacity);
            select.setString(6, fuel);
            select.setString(7, transmission);
            select.setString(8, drive);
            select.setString(9, bodyType);
            select.setInt(10, doors);
            select.setInt(11, seats);
            rs = select.executeQuery();

            if (rs.next()) {
                updateExisting(conn, rs, brand, price);
            } else {
                insertNew(conn, brand, model, version, year, enginePower, engineCapacity, fuel,
                        transmission, drive, bodyType, doors, seats, price);
            }
        } catch (SQLException se) {
            se.printStackTrace();
        } finally {
            close(rs);
            close(select);
            close(conn);
        }
    }

    /**
     * Returns the price range widened just enough to contain {@code price}.
     *
     * @param minPrice current lower bound
     * @param maxPrice current upper bound
     * @param price    newly observed price
     * @return a two-element array {@code {newMin, newMax}}
     */
    static int[] widenPriceRange(int minPrice, int maxPrice, int price) {
        return new int[] {Math.min(minPrice, price), Math.max(maxPrice, price)};
    }

    /** Folds {@code price} into the statistics of the car the result set is positioned on. */
    private static void updateExisting(Connection conn, ResultSet rs, String brand, Integer price) {
        PreparedStatement update = null;
        try {
            long repeatedId = rs.getLong("id");
            int[] range = widenPriceRange(rs.getInt("price_from"), rs.getInt("price_to"), price);
            long priceSum = rs.getLong("price_sum") + price;
            long counter = rs.getLong("counter") + 1;
            double average = priceSum / (double) counter;

            // Only the five statistics columns change.
            update = conn.prepareStatement(UPDATE_SQL);
            update.setInt(1, range[0]);
            update.setInt(2, range[1]);
            update.setLong(3, priceSum);
            update.setDouble(4, average);
            update.setLong(5, counter);
            update.setLong(6, repeatedId);
            update.executeUpdate();
            System.out.println("Zauktualizowano: " + brand);
        } catch (Exception e) {
            System.out.println("Błąd podczas aktualizacji ogłoszenia: " + e.getMessage());
        } finally {
            close(update);
        }
    }

    /** Inserts a new car whose price statistics consist of the single observed price. */
    private static void insertNew(Connection conn, String brand, String model, String version, Integer year, Integer enginePower, Double engineCapacity, String fuel, String transmission, String drive, String bodyType, Integer doors, Integer seats, Integer price) throws SQLException {
        PreparedStatement insert = null;
        try {
            insert = conn.prepareStatement(INSERT_SQL);
            insert.setString(1, brand);
            insert.setString(2, model);
            insert.setString(3, version);
            insert.setInt(4, year);
            insert.setInt(5, enginePower);
            insert.setDouble(6, engineCapacity);
            insert.setString(7, fuel);
            insert.setString(8, transmission);
            insert.setString(9, drive);
            insert.setString(10, bodyType);
            insert.setInt(11, doors);
            insert.setInt(12, seats);
            insert.setInt(13, price);   // price_from
            insert.setInt(14, price);   // price_to
            insert.setInt(15, price);   // price_sum
            insert.setInt(16, 1);       // counter
            insert.setDouble(17, price); // avarage
            insert.executeUpdate();
            System.out.println("Dodano: " + brand);
        } finally {
            close(insert);
        }
    }

    /** Closes the result set if it was opened; a failure to close is only reported. */
    private static void close(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the statement if it was opened; a failure to close is only reported. */
    private static void close(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the connection if it was opened; a failure to close is only reported. */
    private static void close(Connection conn) {
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}