// Listing metadata: 136 lines, 5.2 KiB, Java
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class pobranieDanych {
|
|
public static void main(String[] args) throws IOException {
|
|
Connection[] url = new Connection[6];
|
|
int i, j, n = 0, m = 0, iloscOpinii = 5000;
|
|
String slowo;
|
|
|
|
String[] oceny = new String[iloscOpinii];
|
|
|
|
int[] n_dobry = new int[iloscOpinii];
|
|
int[] n_zly = new int[iloscOpinii];
|
|
int[] n_tak = new int[iloscOpinii];
|
|
int[] n_nie = new int[iloscOpinii];
|
|
int[] n_ok = new int[iloscOpinii];
|
|
int[] n_pomocny = new int[iloscOpinii];
|
|
|
|
int[] length = new int[5000];
|
|
|
|
|
|
for(j=0; j<url.length; j++) {
|
|
for (i = 1; i <= 50; i++) {
|
|
url[0] = Jsoup.connect(String.format("https://www.ceneo.pl/27411790/opinie-%d", i));
|
|
url[1] = Jsoup.connect(String.format("https://www.ceneo.pl/39052947/opinie-%d", i));
|
|
url[2] = Jsoup.connect(String.format("https://www.ceneo.pl/245030/opinie-%d", i));
|
|
url[3] = Jsoup.connect(String.format("https://www.ceneo.pl/12400583/opinie-%d", i));
|
|
url[4] = Jsoup.connect(String.format("https://www.ceneo.pl/9365592/opinie-%d", i));
|
|
url[5] = Jsoup.connect(String.format("https://www.ceneo.pl/82456/opinie-%d", i));
|
|
|
|
Document document = url[j].get();
|
|
|
|
Elements review = document.select("div + p[class=\"product-review-body\"]");
|
|
Elements score = document.select("span[class=\"review-score-count\"]");
|
|
|
|
for (Element elem : score) {
|
|
|
|
String ocena = String.valueOf(elem.text());
|
|
ocena = ocena.substring(0,ocena.length()-2);
|
|
|
|
oceny[n] = ocena;
|
|
|
|
n++;
|
|
}
|
|
|
|
for (Element elem : review) {
|
|
|
|
String opinia = elem.text();
|
|
|
|
length[m] = opinia.length();
|
|
|
|
|
|
// tworzymy nowy obiekt klasy StringTokenizer
|
|
StringTokenizer st = new StringTokenizer(opinia, " ,.!()+-*/\\\'\"");
|
|
|
|
// w pętli dopóki są jeszcze tokeny (metoda hasMoreTokens())
|
|
while(st.hasMoreTokens()) {
|
|
slowo = st.nextToken();
|
|
|
|
switch (slowo) {
|
|
case "dobry":
|
|
case "dobre":
|
|
case "dobrze":
|
|
case "dobra":
|
|
n_dobry[m]++;
|
|
break;
|
|
case "zły":
|
|
case "zly":
|
|
case "źle":
|
|
case "zle":
|
|
case "zła":
|
|
case "zla":
|
|
case "złe":
|
|
case "niedobry":
|
|
case "niedobre":
|
|
case "niedobrze":
|
|
case "niedobra":
|
|
n_zly[m]++;
|
|
break;
|
|
case "tak":
|
|
n_tak[m]++;
|
|
break;
|
|
case "nie":
|
|
n_nie[m]++;
|
|
break;
|
|
case "ok":
|
|
case "okej":
|
|
n_ok[m]++;
|
|
break;
|
|
case "pomocny":
|
|
case "pomocne":
|
|
case "pomógł":
|
|
case "pomogl":
|
|
n_pomocny[m]++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
//System.out.println("ocena: " + oceny[m] + ", review: " + opinie[m]);
|
|
m++;
|
|
}
|
|
}
|
|
}
|
|
System.out.println("Liczba opini n: " + n + ", m: " + m);
|
|
|
|
m = 0;
|
|
double train = n * 0.9;
|
|
|
|
PrintWriter zapis1 = new PrintWriter("C:\\Users\\48724\\IdeaProjects\\si\\opinieVW\\train.txt");
|
|
while(train>m) {
|
|
zapis1.println(oceny[m] + " | " + "length:" + length[m] + " ilosc_dobry:" + n_dobry[m] + " ilosc_zly:" + n_zly[m] + " ilosc_tak:" + n_tak[m] + " ilosc_nie:" + n_nie[m] + " ilosc_ok:" + n_ok[m] + " ilosc_pomocny:" + n_pomocny[m]);
|
|
m++;
|
|
}
|
|
zapis1.close();
|
|
|
|
|
|
PrintWriter zapis2 = new PrintWriter("C:\\Users\\48724\\IdeaProjects\\si\\opinieVW\\doOceny.txt");
|
|
while(n>m) {
|
|
zapis2.println(oceny[m] + " | " + "length:" + length[m] + " ilosc_dobry:" + n_dobry[m] + " ilosc_zly:" + n_zly[m] + " ilosc_tak:" + n_tak[m] + " ilosc_nie:" + n_nie[m] + " ilosc_ok:" + n_ok[m] + " ilosc_pomocny:" + n_pomocny[m]);
|
|
m++;
|
|
}
|
|
zapis2.close();
|
|
|
|
|
|
System.out.println("Ilość wierszy w train.txt: " + m);
|
|
}
|
|
} |