projekt_2019/pobranieDanych.java

136 lines
5.2 KiB
Java
Raw Permalink Normal View History

2019-06-10 23:03:32 +02:00
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads review pages for a fixed set of ceneo.pl products, turns every
 * (score, review text) pair into one feature row and writes the rows to two
 * files: the first 90% to {@code train.txt}, the remainder to {@code doOceny.txt}.
 *
 * <p>Each row has the form
 * {@code <score> | length:<n> ilosc_dobry:<n> ilosc_zly:<n> ilosc_tak:<n> ilosc_nie:<n> ilosc_ok:<n> ilosc_pomocny:<n>}.
 * NOTE(review): the layout and the output directory name suggest Vowpal Wabbit
 * input - confirm with the consumer of these files.
 */
public class pobranieDanych {

    /** Product identifiers whose review pages are scraped, in scraping order. */
    private static final String[] PRODUCT_IDS = {
        "27411790", "39052947", "245030", "12400583", "9365592", "82456"
    };

    /** Number of review pages requested per product (pages 1..50). */
    private static final int PAGES_PER_PRODUCT = 50;

    /** Fraction of the collected rows that goes into the training file. */
    private static final double TRAIN_FRACTION = 0.9;

    /** Characters that separate words inside a review text. */
    private static final String WORD_DELIMITERS = " ,.!()+-*/\\\'\"";

    private static final String TRAIN_FILE = "C:\\Users\\48724\\IdeaProjects\\si\\opinieVW\\train.txt";
    private static final String TEST_FILE = "C:\\Users\\48724\\IdeaProjects\\si\\opinieVW\\doOceny.txt";

    /**
     * Scrapes all configured review pages and writes the train / evaluation files.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        List<String> rows = new ArrayList<>();
        int n = 0; // total number of score elements seen
        int m = 0; // total number of review bodies seen

        for (String productId : PRODUCT_IDS) {
            for (int page = 1; page <= PAGES_PER_PRODUCT; page++) {
                // Only the page that is actually fetched gets a connection.
                String url = String.format("https://www.ceneo.pl/%s/opinie-%d", productId, page);
                Document document = Jsoup.connect(url).get();
                Elements review = document.select("div + p[class=\"product-review-body\"]");
                Elements score = document.select("span[class=\"review-score-count\"]");

                n += score.size();
                m += review.size();

                // Scores and bodies are matched by position. Pairing per page keeps a
                // single unbalanced page from shifting every later row.
                if (score.size() != review.size()) {
                    System.err.println("Uwaga: " + url + " - liczba ocen (" + score.size()
                            + ") różni się od liczby opinii (" + review.size() + ")");
                }
                int pairs = Math.min(score.size(), review.size());
                for (int k = 0; k < pairs; k++) {
                    String ocena = parseScore(score.get(k).text());
                    rows.add(featureLine(ocena, review.get(k).text()));
                }
            }
        }
        System.out.println("Liczba opini n: " + n + ", m: " + m);

        // Same split as "write while index < total * 0.9": ceil(total * 0.9) rows.
        int total = rows.size();
        int trainRows = (int) Math.ceil(total * TRAIN_FRACTION);

        writeRows(TRAIN_FILE, rows.subList(0, trainRows));
        writeRows(TEST_FILE, rows.subList(trainRows, total));

        // Report the number of training rows, not the running index after both files.
        System.out.println("Ilość wierszy w train.txt: " + trainRows);
    }

    /**
     * Drops the last two characters of the score text.
     * NOTE(review): presumably this removes a "/5"-style suffix - confirm against the page markup.
     *
     * @param text raw text of the score element
     * @return the text without its last two characters, or the text unchanged
     *         when it is shorter than two characters
     */
    private static String parseScore(String text) {
        return text.length() >= 2 ? text.substring(0, text.length() - 2) : text;
    }

    /**
     * Builds one output row: the score, the review length in characters and the
     * number of occurrences of each keyword group in the review.
     *
     * <p>NOTE(review): matching is case-sensitive, so capitalised forms
     * (e.g. "Dobry" at the start of a sentence) are not counted.
     *
     * @param ocena  score label placed before the {@code |} separator
     * @param opinia review text to analyse
     * @return the formatted feature row (without line terminator)
     */
    private static String featureLine(String ocena, String opinia) {
        int dobry = 0;
        int zly = 0;
        int tak = 0;
        int nie = 0;
        int ok = 0;
        int pomocny = 0;

        StringTokenizer st = new StringTokenizer(opinia, WORD_DELIMITERS);
        while (st.hasMoreTokens()) {
            switch (st.nextToken()) {
                case "dobry":
                case "dobre":
                case "dobrze":
                case "dobra":
                    dobry++;
                    break;
                case "zły":
                case "zly":
                case "źle":
                case "zle":
                case "zła":
                case "zla":
                case "złe":
                case "niedobry":
                case "niedobre":
                case "niedobrze":
                case "niedobra":
                    zly++;
                    break;
                case "tak":
                    tak++;
                    break;
                case "nie":
                    nie++;
                    break;
                case "ok":
                case "okej":
                    ok++;
                    break;
                case "pomocny":
                case "pomocne":
                case "pomógł":
                case "pomogl":
                    pomocny++;
                    break;
                default:
                    // Word is not one of the tracked keywords.
                    break;
            }
        }

        return ocena + " | " + "length:" + opinia.length()
                + " ilosc_dobry:" + dobry
                + " ilosc_zly:" + zly
                + " ilosc_tak:" + tak
                + " ilosc_nie:" + nie
                + " ilosc_ok:" + ok
                + " ilosc_pomocny:" + pomocny;
    }

    /**
     * Writes the given rows to {@code path}, one per line, replacing any existing file.
     *
     * @param path destination file
     * @param rows rows to write, in order
     * @throws IOException if the file cannot be opened or a write error occurred
     */
    private static void writeRows(String path, List<String> rows) throws IOException {
        try (PrintWriter out = new PrintWriter(path)) {
            for (String row : rows) {
                out.println(row);
            }
            // PrintWriter never throws on write failures; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("Failed to write " + path);
            }
        }
    }
}