- moved all code that contained MIT IP outside (http://code.google.com/p/simile-vicino/)
- moved bzip2 and tar code from apache ant into their own jar files
- now gridworks source contains only com.metaweb.* code; everything else is a jar dependency
- started to work on archive importer

git-svn-id: http://google-refine.googlecode.com/svn/trunk@376 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
4eda7ae2c0
commit
72203cd3d5
@ -15,11 +15,12 @@
|
|||||||
<classpathentry kind="lib" path="lib/arithcode-1.1.jar" sourcepath="lib-src/arithcode-1.1-sources.jar"/>
|
<classpathentry kind="lib" path="lib/arithcode-1.1.jar" sourcepath="lib-src/arithcode-1.1-sources.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/jdatapath-alpha2.jar" sourcepath="lib-src/jdatapath-alpha2-sources.jar"/>
|
<classpathentry kind="lib" path="lib/jdatapath-alpha2.jar" sourcepath="lib-src/jdatapath-alpha2-sources.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/secondstring-20100303.jar" sourcepath="lib-src/secondstring-20100303-sources.jar"/>
|
<classpathentry kind="lib" path="lib/secondstring-20100303.jar" sourcepath="lib-src/secondstring-20100303-sources.jar"/>
|
||||||
|
<classpathentry kind="lib" path="lib/ant-tools-1.8.0.jar" sourcepath="lib-src/ant-tools-1.8.0-sources.jar"/>
|
||||||
|
<classpathentry kind="lib" path="lib/vicino-1.1.jar" sourcepath="lib-src/vicino-1.1-sources.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
|
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
|
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/apache-tools-tar.jar"/>
|
|
||||||
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
|
|
||||||
<classpathentry kind="lib" path="lib/jython-2.5.1.jar"/>
|
<classpathentry kind="lib" path="lib/jython-2.5.1.jar"/>
|
||||||
<classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/>
|
<classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/>
|
||||||
|
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
|
||||||
<classpathentry kind="output" path="build/classes"/>
|
<classpathentry kind="output" path="build/classes"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2010, Metaweb Technologies, Inc. All rights reserved.
|
* Copyright (c) 2010 Metaweb Technologies, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* Redistribution and use in source and binary forms, with or without
|
* Redistribution and use in source and binary forms, with or without
|
||||||
* modification, are permitted provided that the following conditions
|
* modification, are permitted provided that the following conditions
|
||||||
@ -36,9 +36,9 @@ See the 'licenses' directory for a list of the licenses for the libraries we dep
|
|||||||
ordered here by license:
|
ordered here by license:
|
||||||
|
|
||||||
licenses/apache2.0.LICENSE.txt
|
licenses/apache2.0.LICENSE.txt
|
||||||
ant (package org.apache.tools.tar)
|
|
||||||
bzip2 (package org.apache.tools.bzip2)
|
|
||||||
calendar-parser (package com.metaweb.gridworks.expr.util)
|
calendar-parser (package com.metaweb.gridworks.expr.util)
|
||||||
|
ant-tools
|
||||||
|
bzip2
|
||||||
commons-lang
|
commons-lang
|
||||||
commons-codec
|
commons-codec
|
||||||
jdatapath
|
jdatapath
|
||||||
@ -58,7 +58,7 @@ licenses/dom4j.LICENSE.txt (BSD family)
|
|||||||
dom4j
|
dom4j
|
||||||
|
|
||||||
licenses/simile.LICENSE.txt (BSD family)
|
licenses/simile.LICENSE.txt (BSD family)
|
||||||
vicino (package edu.mit.simile.vicino)
|
vicino
|
||||||
|
|
||||||
licenses/arithcode.LICENSE.txt (BSD family)
|
licenses/arithcode.LICENSE.txt (BSD family)
|
||||||
arithcode
|
arithcode
|
||||||
|
BIN
lib-src/ant-tools-1.8.0-sources.jar
Normal file
BIN
lib-src/ant-tools-1.8.0-sources.jar
Normal file
Binary file not shown.
BIN
lib-src/vicino-1.1-sources.jar
Normal file
BIN
lib-src/vicino-1.1-sources.jar
Normal file
Binary file not shown.
BIN
lib/ant-tools-1.8.0.jar
Normal file
BIN
lib/ant-tools-1.8.0.jar
Normal file
Binary file not shown.
Binary file not shown.
BIN
lib/vicino-1.1.jar
Normal file
BIN
lib/vicino-1.1.jar
Normal file
Binary file not shown.
@ -37,6 +37,7 @@ import com.metaweb.util.threads.ThreadPoolExecutorAdapter;
|
|||||||
public class Gridworks {
|
public class Gridworks {
|
||||||
|
|
||||||
static private String version;
|
static private String version;
|
||||||
|
static private File tempDir;
|
||||||
|
|
||||||
private static Logger root = Logger.getRootLogger();
|
private static Logger root = Logger.getRootLogger();
|
||||||
private static Logger logger = Logger.getLogger("com.metaweb.gridworks");
|
private static Logger logger = Logger.getLogger("com.metaweb.gridworks");
|
||||||
@ -65,6 +66,10 @@ public class Gridworks {
|
|||||||
return version;
|
return version;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static File getTempFile(String name) {
|
||||||
|
return new File(tempDir, name);
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
// tell jetty to use SLF4J for logging instead of its own stuff
|
// tell jetty to use SLF4J for logging instead of its own stuff
|
||||||
@ -84,6 +89,9 @@ public class Gridworks {
|
|||||||
jetty_logger.setLevel(Level.WARN);
|
jetty_logger.setLevel(Level.WARN);
|
||||||
|
|
||||||
version = Configurations.get("gridworks.version","trunk");
|
version = Configurations.get("gridworks.version","trunk");
|
||||||
|
|
||||||
|
tempDir = new File(Configurations.get("gridworks.temp","temp"));
|
||||||
|
if (!tempDir.exists()) tempDir.mkdirs();
|
||||||
|
|
||||||
Gridworks gridworks = new Gridworks();
|
Gridworks gridworks = new Gridworks();
|
||||||
|
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
package com.metaweb.gridworks.commands.edit;
|
package com.metaweb.gridworks.commands.edit;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
@ -9,12 +12,24 @@ import java.io.StringReader;
|
|||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.net.URLConnection;
|
import java.net.URLConnection;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
import javax.servlet.ServletException;
|
import javax.servlet.ServletException;
|
||||||
import javax.servlet.http.HttpServletRequest;
|
import javax.servlet.http.HttpServletRequest;
|
||||||
import javax.servlet.http.HttpServletResponse;
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
|
||||||
|
import org.apache.tools.bzip2.CBZip2InputStream;
|
||||||
|
import org.apache.tools.tar.TarEntry;
|
||||||
|
import org.apache.tools.tar.TarInputStream;
|
||||||
|
|
||||||
import com.ibm.icu.text.CharsetDetector;
|
import com.ibm.icu.text.CharsetDetector;
|
||||||
import com.ibm.icu.text.CharsetMatch;
|
import com.ibm.icu.text.CharsetMatch;
|
||||||
import com.metaweb.gridworks.Gridworks;
|
import com.metaweb.gridworks.Gridworks;
|
||||||
@ -91,7 +106,9 @@ public class CreateProjectCommand extends Command {
|
|||||||
while ((part = parser.readNextPart()) != null) {
|
while ((part = parser.readNextPart()) != null) {
|
||||||
|
|
||||||
if (part.isFile()) {
|
if (part.isFile()) {
|
||||||
internalImportFilePart((FilePart) part, project, options);
|
|
||||||
|
FilePart filePart = (FilePart) part;
|
||||||
|
internalImportFile(project, options, filePart.getFileName(), filePart.getInputStream());
|
||||||
|
|
||||||
} else if (part.isParam()) {
|
} else if (part.isParam()) {
|
||||||
ParamPart paramPart = (ParamPart) part;
|
ParamPart paramPart = (ParamPart) part;
|
||||||
@ -118,15 +135,120 @@ public class CreateProjectCommand extends Command {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void internalImportFilePart(
|
protected void internalImportFile(
|
||||||
FilePart filePart,
|
|
||||||
Project project,
|
Project project,
|
||||||
Properties options
|
Properties options,
|
||||||
|
String fileName,
|
||||||
|
InputStream inputStream
|
||||||
) throws Exception {
|
) throws Exception {
|
||||||
|
|
||||||
Importer importer = guessImporter(options, null, filePart.getFileName());
|
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tar.bz2")) {
|
||||||
|
// first, save the file on disk, since we need two passes and we might
|
||||||
internalInvokeImporter(project, importer, options, filePart.getInputStream(), null);
|
// not have enough memory to keep it all in there
|
||||||
|
File file = save(inputStream);
|
||||||
|
|
||||||
|
// in the first pass, gather statistics about what files are in there
|
||||||
|
// unfortunately, we have to rely on files extensions, which is horrible but
|
||||||
|
// better than nothing
|
||||||
|
BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
|
||||||
|
InputStream is = (fileName.endsWith(".tar.gz")) ? new GZIPInputStream(stream): new CBZip2InputStream(stream);
|
||||||
|
TarInputStream tis = new TarInputStream(is);
|
||||||
|
HashMap<String,Integer> ext_map = new HashMap<String,Integer>();
|
||||||
|
while (true) {
|
||||||
|
TarEntry entry = tis.getNextEntry();
|
||||||
|
if (entry == null) break;
|
||||||
|
if (!entry.isDirectory()) {
|
||||||
|
String name = entry.getName();
|
||||||
|
String ext = getExtension(name)[1];
|
||||||
|
if (ext_map.containsKey(ext)) {
|
||||||
|
ext_map.put(ext, ext_map.get(ext) + 1);
|
||||||
|
} else {
|
||||||
|
ext_map.put(ext, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stream.close();
|
||||||
|
|
||||||
|
// sort extensions by how often they appear
|
||||||
|
List<Entry<String,Integer>> values = new ArrayList<Entry<String,Integer>>(ext_map.entrySet());
|
||||||
|
Collections.sort(values, new ValuesComparator());
|
||||||
|
|
||||||
|
if (values.size() == 0) {
|
||||||
|
throw new RuntimeException("The archive contains no files.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// this will contain the set of extensions we'll load from the archive
|
||||||
|
HashSet<String> exts = new HashSet<String>();
|
||||||
|
|
||||||
|
// find the extension that is most frequent or those who share the highest frequency value
|
||||||
|
Entry<String,Integer> most_frequent = values.get(0);
|
||||||
|
Entry<String,Integer> second_most_frequent = values.get(1);
|
||||||
|
if (most_frequent.getValue() > second_most_frequent.getValue()) { // we have a winner
|
||||||
|
exts.add(most_frequent.getKey());
|
||||||
|
} else { // multiple extensions have the same frequency
|
||||||
|
int winning_frequency = most_frequent.getValue();
|
||||||
|
for (Entry<String,Integer> e : values) {
|
||||||
|
if (e.getValue() == winning_frequency) {
|
||||||
|
exts.add(e.getKey());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Gridworks.log("Most frequent extensions: " + exts.toString());
|
||||||
|
|
||||||
|
|
||||||
|
} else if (fileName.endsWith(".zip")) {
|
||||||
|
|
||||||
|
} else if (fileName.endsWith(".gz")) {
|
||||||
|
String[] frags = getExtension(fileName);
|
||||||
|
internalImportFile(project, options, frags[0], new GZIPInputStream(inputStream));
|
||||||
|
} else if (fileName.endsWith(".bz2")) {
|
||||||
|
String[] frags = getExtension(fileName);
|
||||||
|
internalImportFile(project, options, frags[0], new CBZip2InputStream(inputStream));
|
||||||
|
} else {
|
||||||
|
load(project, options, fileName, inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Orders map entries by descending value, so that after sorting the entry
 * with the highest count comes first.
 */
public class ValuesComparator implements Comparator<Entry<String,Integer>> {
    /**
     * @return a negative number if {@code o1} has the larger value, a positive number
     *         if {@code o2} has the larger value, zero if the values are equal
     */
    public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
        // compareTo instead of subtraction: a difference of two ints can overflow
        // and flip the sign of the result
        return o2.getValue().compareTo(o1.getValue());
    }
}
|
||||||
|
|
||||||
|
private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception {
|
||||||
|
Importer importer = guessImporter(options, null, fileName);
|
||||||
|
internalInvokeImporter(project, importer, options, inputStream, null);
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private File save(InputStream is) throws IOException {
|
||||||
|
File temp = Gridworks.getTempFile(Long.toString(System.currentTimeMillis()));
|
||||||
|
temp.deleteOnExit();
|
||||||
|
copy(is,temp);
|
||||||
|
is.close();
|
||||||
|
return temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String[] getExtension(String filename) {
|
||||||
|
String[] result = new String[2];
|
||||||
|
int ext_index = filename.lastIndexOf(".");
|
||||||
|
result[0] = (ext_index == -1) ? filename : filename.substring(0,ext_index);
|
||||||
|
result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long copy(InputStream input, File file) throws IOException {
|
||||||
|
FileOutputStream output = new FileOutputStream(file);
|
||||||
|
byte[] buffer = new byte[4 * 1024];
|
||||||
|
long count = 0;
|
||||||
|
int n = 0;
|
||||||
|
while (-1 != (n = input.read(buffer))) {
|
||||||
|
output.write(buffer, 0, n);
|
||||||
|
count += n;
|
||||||
|
}
|
||||||
|
output.close();
|
||||||
|
input.close();
|
||||||
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void internalImportURL(
|
protected void internalImportURL(
|
||||||
@ -237,17 +359,9 @@ public class CreateProjectCommand extends Command {
|
|||||||
new InputStreamReader(inputStream);
|
new InputStreamReader(inputStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
importer.read(reader, project, options, skip, limit);
|
||||||
importer.read(reader, project, options, skip, limit);
|
|
||||||
} finally {
|
|
||||||
reader.close();
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
try {
|
importer.read(inputStream, project, options, skip, limit);
|
||||||
importer.read(inputStream, project, options, skip, limit);
|
|
||||||
} finally {
|
|
||||||
inputStream.close();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -295,6 +409,12 @@ public class CreateProjectCommand extends Command {
|
|||||||
return new ExcelImporter(false);
|
return new ExcelImporter(false);
|
||||||
} else if("application/x-xls".equals(contentType)) {
|
} else if("application/x-xls".equals(contentType)) {
|
||||||
return new ExcelImporter(true);
|
return new ExcelImporter(true);
|
||||||
|
} else if("application/xml".equals(contentType) ||
|
||||||
|
"text/xml".equals(contentType) ||
|
||||||
|
"application/rss+xml".equals(contentType) ||
|
||||||
|
"application/atom+xml".equals(contentType) ||
|
||||||
|
"application/rdf+xml".equals(contentType)) {
|
||||||
|
return new XmlImporter();
|
||||||
}
|
}
|
||||||
} else if (fileName != null) {
|
} else if (fileName != null) {
|
||||||
fileName = fileName.toLowerCase();
|
fileName = fileName.toLowerCase();
|
||||||
|
@ -1,79 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.clustering.Clusterer;
|
|
||||||
import edu.mit.simile.vicino.clustering.NGramClusterer;
|
|
||||||
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class Cluster extends Operator {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
(new Cluster()).init(args);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void init(String[] args) throws Exception {
|
|
||||||
Distance distance = getDistance(args[0]);
|
|
||||||
List<String> strings = getStrings(args[1]);
|
|
||||||
double radius = Double.parseDouble(args[2]);
|
|
||||||
int blocking_size = Integer.parseInt(args[3]);
|
|
||||||
|
|
||||||
long vptree_start = System.currentTimeMillis();
|
|
||||||
Clusterer vptree_clusterer = new VPTreeClusterer(distance);
|
|
||||||
for (String s: strings) {
|
|
||||||
vptree_clusterer.populate(s);
|
|
||||||
}
|
|
||||||
List<Set<Serializable>> vptree_clusters = vptree_clusterer.getClusters(radius);
|
|
||||||
long vptree_elapsed = System.currentTimeMillis() - vptree_start;
|
|
||||||
int vptree_distances = distance.getCount();
|
|
||||||
distance.resetCounter();
|
|
||||||
|
|
||||||
long ngram_start = System.currentTimeMillis();
|
|
||||||
Clusterer ngram_clusterer = new NGramClusterer(distance,blocking_size);
|
|
||||||
for (String s: strings) {
|
|
||||||
ngram_clusterer.populate(s);
|
|
||||||
}
|
|
||||||
List<Set<Serializable>> ngram_clusters = ngram_clusterer.getClusters(radius);
|
|
||||||
long ngram_elapsed = System.currentTimeMillis() - ngram_start;
|
|
||||||
int ngram_distances = distance.getCount();
|
|
||||||
distance.resetCounter();
|
|
||||||
|
|
||||||
log("VPTree found " + vptree_clusters.size() + " in " + vptree_elapsed + " ms with " + vptree_distances + " distances\n");
|
|
||||||
log("NGram found " + ngram_clusters.size() + " in " + ngram_elapsed + " ms with " + ngram_distances + " distances\n");
|
|
||||||
|
|
||||||
if (vptree_clusters.size() > ngram_clusters.size()) {
|
|
||||||
log("VPTree clusterer found these clusters the other method couldn't: ");
|
|
||||||
diff(vptree_clusters,ngram_clusters);
|
|
||||||
} else if (ngram_clusters.size() > vptree_clusters.size()) {
|
|
||||||
log("NGram clusterer found these clusters the other method couldn't: ");
|
|
||||||
diff(ngram_clusters,vptree_clusters);
|
|
||||||
}
|
|
||||||
|
|
||||||
System.exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void diff(List<Set<Serializable>> more, List<Set<Serializable>> base) {
|
|
||||||
Set<Set<Serializable>> holder = new HashSet<Set<Serializable>>(base.size());
|
|
||||||
|
|
||||||
for (Set<Serializable> s : base) {
|
|
||||||
holder.add(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Set<Serializable> s : more) {
|
|
||||||
if (!holder.contains(s)) {
|
|
||||||
printCluster(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void printCluster(Set<Serializable> cluster) {
|
|
||||||
for (Serializable s : cluster) {
|
|
||||||
log(s.toString());
|
|
||||||
}
|
|
||||||
log("");
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,61 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class Distributor extends Operator {
|
|
||||||
|
|
||||||
private static final int COLUMNS = 70;
|
|
||||||
private static final char CHAR = '=';
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
|
|
||||||
Distance d = getDistance(args[0]);
|
|
||||||
|
|
||||||
List<String> strings = getStrings(args[1]);
|
|
||||||
|
|
||||||
int buckets = Integer.parseInt(args[2]);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
int[] values = new int[buckets];
|
|
||||||
|
|
||||||
int size = strings.size();
|
|
||||||
for (int i = 0; i < size; i++) {
|
|
||||||
String x = (String) strings.get((int) (Math.random() * size));
|
|
||||||
String y = (String) strings.get((int) (Math.random() * size));
|
|
||||||
int dist = (int) (buckets * d.d(x, y));
|
|
||||||
values[dist]++;
|
|
||||||
System.out.print(".");
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
|
|
||||||
long stop = System.currentTimeMillis();
|
|
||||||
float m = ((float) (stop - start)) / (float) size;
|
|
||||||
|
|
||||||
int maxValue = 0;
|
|
||||||
for (int i = 0; i < buckets; i++) {
|
|
||||||
if (values[i] > maxValue) {
|
|
||||||
maxValue = values[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
System.out
|
|
||||||
.println("+-------------------------------------------------------------------");
|
|
||||||
for (int i = 0; i < buckets; i++) {
|
|
||||||
System.out.println("|" + bar(COLUMNS * values[i] / maxValue));
|
|
||||||
}
|
|
||||||
System.out
|
|
||||||
.println("+-------------------------------------------------------------------");
|
|
||||||
|
|
||||||
System.out.println("\n Each distance calculation took: " + m + " millis");
|
|
||||||
}
|
|
||||||
|
|
||||||
static private String bar(int value) {
|
|
||||||
StringBuffer b = new StringBuffer(value);
|
|
||||||
for (int i = 0; i < value; i++) {
|
|
||||||
b.append(CHAR);
|
|
||||||
}
|
|
||||||
return b.toString();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,12 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class Meter extends Operator {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
Distance d = getDistance(args[0]);
|
|
||||||
System.out.println(args[1] + " <- " + d.d(args[1], args[2]) + " -> " + args[2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,94 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import com.wcohen.ss.api.Token;
|
|
||||||
import com.wcohen.ss.api.Tokenizer;
|
|
||||||
|
|
||||||
public class NGramTokenizer implements Tokenizer {
|
|
||||||
|
|
||||||
private int ngram_size;
|
|
||||||
|
|
||||||
public NGramTokenizer(int ngram_size) {
|
|
||||||
this.ngram_size = ngram_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Token[] tokenize(String str) {
|
|
||||||
str = normalize(str);
|
|
||||||
List<Token> tokens = new ArrayList<Token>();
|
|
||||||
for (int i = 0; i < str.length(); i++) {
|
|
||||||
int index = i + ngram_size;
|
|
||||||
if (index <= str.length()) {
|
|
||||||
tokens.add(intern(str.substring(i,index)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return (Token[]) tokens.toArray(new BasicToken[tokens.size()]);
|
|
||||||
}
|
|
||||||
|
|
||||||
static final Pattern extra = Pattern.compile("\\p{Cntrl}|\\p{Punct}");
|
|
||||||
static final Pattern whitespace = Pattern.compile("\\p{Space}+");
|
|
||||||
|
|
||||||
private String normalize(String s) {
|
|
||||||
s = s.trim();
|
|
||||||
s = extra.matcher(s).replaceAll("");
|
|
||||||
s = whitespace.matcher(s).replaceAll(" ");
|
|
||||||
s = s.toLowerCase();
|
|
||||||
return s.intern();
|
|
||||||
}
|
|
||||||
|
|
||||||
private int nextId = 0;
|
|
||||||
private Map<String, Token> tokMap = new TreeMap<String, Token>();
|
|
||||||
|
|
||||||
public Token intern(String s) {
|
|
||||||
s = s.toLowerCase().intern();
|
|
||||||
Token tok = tokMap.get(s);
|
|
||||||
if (tok == null) {
|
|
||||||
tok = new BasicToken(++nextId, s);
|
|
||||||
tokMap.put(s, tok);
|
|
||||||
}
|
|
||||||
return tok;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<Token> tokenIterator() {
|
|
||||||
return tokMap.values().iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int maxTokenIndex() {
|
|
||||||
return nextId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public class BasicToken implements Token, Comparable<Token> {
|
|
||||||
private final int index;
|
|
||||||
private final String value;
|
|
||||||
|
|
||||||
BasicToken(int index, String value) {
|
|
||||||
this.index = index;
|
|
||||||
this.value = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getValue() {
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getIndex() {
|
|
||||||
return index;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int compareTo(Token t) {
|
|
||||||
return index - t.getIndex();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int hashCode() {
|
|
||||||
return value.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "[token#" + getIndex() + ":" + getValue() + "]";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,47 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class Operator {
|
|
||||||
|
|
||||||
static void log(String msg) {
|
|
||||||
System.out.println(msg);
|
|
||||||
}
|
|
||||||
|
|
||||||
static Distance getDistance(String distance) throws Exception {
|
|
||||||
return (Distance) Class.forName("edu.mit.simile.vicino.distances." + distance + "Distance").newInstance();
|
|
||||||
}
|
|
||||||
|
|
||||||
static List<String> getStrings(String fileName) throws IOException {
|
|
||||||
List<String> strings = new ArrayList<String>();
|
|
||||||
|
|
||||||
File file = new File(fileName);
|
|
||||||
if (file.isDirectory()) {
|
|
||||||
File[] files = file.listFiles();
|
|
||||||
for (File f : files) {
|
|
||||||
getStrings(f, strings);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
getStrings(file, strings);
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void getStrings(File file, List<String> strings) throws IOException {
|
|
||||||
BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
|
|
||||||
String line;
|
|
||||||
while ((line = input.readLine()) != null) {
|
|
||||||
strings.add(line.trim().intern());
|
|
||||||
}
|
|
||||||
input.close();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,53 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTree;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
|
|
||||||
|
|
||||||
public class Seeker extends Operator {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
Distance d = getDistance(args[0]);
|
|
||||||
|
|
||||||
log("Working with distance: " + d);
|
|
||||||
List<String> strings = getStrings(args[1]);
|
|
||||||
log("Obtained " + strings.size() + " from " + args[1]);
|
|
||||||
|
|
||||||
log("Building VPTree...");
|
|
||||||
VPTreeBuilder builder = new VPTreeBuilder(d);
|
|
||||||
VPTree tree = builder.buildVPTree(strings);
|
|
||||||
log("..done");
|
|
||||||
|
|
||||||
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
|
|
||||||
|
|
||||||
log("type a string|range then hit return:");
|
|
||||||
BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
|
|
||||||
String line = null;
|
|
||||||
while ((line = input.readLine()) != null) {
|
|
||||||
int index = line.indexOf('|');
|
|
||||||
String query = line.substring(0, index);
|
|
||||||
float range = Float.parseFloat(line.substring(index + 1));
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
Set<Serializable> results = seeker.range(query, range);
|
|
||||||
long stop = System.currentTimeMillis();
|
|
||||||
Iterator<Serializable> j = results.iterator();
|
|
||||||
if (j.hasNext()) {
|
|
||||||
while (j.hasNext()) {
|
|
||||||
String r = (String) j.next();
|
|
||||||
log(" " + r);
|
|
||||||
}
|
|
||||||
log(" [done in " + (stop - start) + "ms]");
|
|
||||||
} else {
|
|
||||||
log(" [no results found in " + (stop - start) + "ms]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,46 +0,0 @@
|
|||||||
package edu.mit.simile.vicino;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class Tester extends Operator {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
|
||||||
Distance d = getDistance(args[0]);
|
|
||||||
|
|
||||||
List<String> strings = getStrings(args[1]);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
|
|
||||||
int size = strings.size();
|
|
||||||
for (int i = 0; i < size * size * size; i++) {
|
|
||||||
String x = (String) strings.get((int) (Math.random() * size));
|
|
||||||
String y = (String) strings.get((int) (Math.random() * size));
|
|
||||||
String z = (String) strings.get((int) (Math.random() * size));
|
|
||||||
boolean metric = metric(x, y, z, d);
|
|
||||||
if (metric) {
|
|
||||||
System.out.println("metric");
|
|
||||||
} else {
|
|
||||||
System.out.println("***** NOT METRIC *****");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
long stop = System.currentTimeMillis();
|
|
||||||
float m = ((float) (stop - start)) / (float) size;
|
|
||||||
|
|
||||||
System.out.println("\n Each metric evaluation took: " + m + " millis");
|
|
||||||
}
|
|
||||||
|
|
||||||
static boolean metric(String x, String y, String z, Distance d) {
|
|
||||||
double dxx = d.d(x, x);
|
|
||||||
boolean identity = (dxx == 0.0f);
|
|
||||||
double dxy = d.d(x, y);
|
|
||||||
double dyx = d.d(y, x);
|
|
||||||
boolean simmetrical = (dxy == dyx);
|
|
||||||
double dxz = d.d(x, z);
|
|
||||||
double dyz = d.d(y, z);
|
|
||||||
boolean triangular = (dxz <= dxy + dyz);
|
|
||||||
return (identity && simmetrical && triangular);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,20 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.clustering;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
 * Contract of a string clusterer: strings are handed over one at a time,
 * then the clusters can be requested for a given radius.
 */
public abstract class Clusterer {

    /** Orders clusters by decreasing number of members. */
    public class SizeComparator implements Comparator<Set<Serializable>> {
        public int compare(Set<Serializable> o1, Set<Serializable> o2) {
            final int first = o1.size();
            final int second = o2.size();
            return second - first;
        }
    }

    /** Adds one string to the data to be clustered. */
    public abstract void populate(String s);

    /** Returns the clusters found among the populated strings for the given radius. */
    public abstract List<Set<Serializable>> getClusters(double radius);

}
|
|
@ -1,194 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.clustering;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
import java.util.Map.Entry;
|
|
||||||
import java.util.concurrent.Callable;
|
|
||||||
import java.util.concurrent.ExecutionException;
|
|
||||||
import java.util.concurrent.ExecutorService;
|
|
||||||
import java.util.concurrent.Executors;
|
|
||||||
import java.util.concurrent.Future;
|
|
||||||
|
|
||||||
import com.wcohen.ss.api.Token;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.NGramTokenizer;
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
public class NGramClusterer extends Clusterer {
|
|
||||||
|
|
||||||
NGramTokenizer _tokenizer;
|
|
||||||
Distance _distance;
|
|
||||||
|
|
||||||
Map<String,Set<String>> blocks = new HashMap<String,Set<String>>();
|
|
||||||
|
|
||||||
public NGramClusterer(Distance d, int blockSize) {
|
|
||||||
_tokenizer = new NGramTokenizer(blockSize);
|
|
||||||
_distance = d;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void populate(String s) {
|
|
||||||
Token[] tokens = _tokenizer.tokenize(s);
|
|
||||||
for (Token t : tokens) {
|
|
||||||
String ss = t.getValue();
|
|
||||||
Set<String> l = null;
|
|
||||||
if (!blocks.containsKey(ss)) {
|
|
||||||
l = new TreeSet<String>();
|
|
||||||
blocks.put(ss, l);
|
|
||||||
} else {
|
|
||||||
l = blocks.get(ss);
|
|
||||||
}
|
|
||||||
l.add(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public class BlockEvaluator implements Callable<Map<Serializable,Set<Serializable>>> {
|
|
||||||
|
|
||||||
int start;
|
|
||||||
int stop;
|
|
||||||
double radius;
|
|
||||||
|
|
||||||
List<Set<String>> blocks;
|
|
||||||
Map<Serializable,Set<Serializable>> cluster_map;
|
|
||||||
|
|
||||||
public BlockEvaluator(List<Set<String>> blocks, double radius, int start, int stop) {
|
|
||||||
this.blocks = blocks;
|
|
||||||
this.start = start;
|
|
||||||
this.stop = stop;
|
|
||||||
this.radius = radius;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<Serializable,Set<Serializable>> call() {
|
|
||||||
Map<Serializable,Set<Serializable>> cluster_map = new HashMap<Serializable,Set<Serializable>>();
|
|
||||||
|
|
||||||
for (int i = start; i < stop; i++) {
|
|
||||||
Set<String> set = blocks.get(i);
|
|
||||||
if (set.size() < 2) continue;
|
|
||||||
for (String a : set) {
|
|
||||||
for (String b : set) {
|
|
||||||
if (a == b) continue;
|
|
||||||
if (cluster_map.containsKey(a) && cluster_map.get(a).contains(b)) continue;
|
|
||||||
if (cluster_map.containsKey(b) && cluster_map.get(b).contains(a)) continue;
|
|
||||||
double d = _distance.d(a,b);
|
|
||||||
if (d <= radius || radius < 0) {
|
|
||||||
Set<Serializable> l = null;
|
|
||||||
if (!cluster_map.containsKey(a)) {
|
|
||||||
l = new TreeSet<Serializable>();
|
|
||||||
l.add(a);
|
|
||||||
cluster_map.put(a, l);
|
|
||||||
} else {
|
|
||||||
l = cluster_map.get(a);
|
|
||||||
}
|
|
||||||
l.add(b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return cluster_map;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
|
||||||
|
|
||||||
private static final boolean MULTITHREADED = true;
|
|
||||||
|
|
||||||
public List<Set<Serializable>> getClusters(double radius) {
|
|
||||||
if (MULTITHREADED) {
|
|
||||||
return getClustersMultiThread(radius);
|
|
||||||
} else {
|
|
||||||
return getClustersSingleThread(radius);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Set<Serializable>> getClustersMultiThread(double radius) {
|
|
||||||
|
|
||||||
int cores = Runtime.getRuntime().availableProcessors();
|
|
||||||
int size = blocks.size();
|
|
||||||
int range = size / cores + 1;
|
|
||||||
|
|
||||||
List<Map<Serializable,Set<Serializable>>> cluster_maps = new ArrayList<Map<Serializable,Set<Serializable>>>(cores);
|
|
||||||
|
|
||||||
List<BlockEvaluator> evaluators = new ArrayList<BlockEvaluator>(cores);
|
|
||||||
for (int i = 0; i < cores; i++) {
|
|
||||||
int range_start = range * i;
|
|
||||||
int range_end = range * (i + 1);
|
|
||||||
if (range_end > size) range_end = size;
|
|
||||||
evaluators.add(new BlockEvaluator(new ArrayList<Set<String>>(blocks.values()),radius,range_start,range_end));
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
List<Future<Map<Serializable,Set<Serializable>>>> futures = executor.invokeAll(evaluators);
|
|
||||||
for (Future<Map<Serializable,Set<Serializable>>> future : futures) {
|
|
||||||
cluster_maps.add(future.get());
|
|
||||||
}
|
|
||||||
} catch (InterruptedException e1) {
|
|
||||||
e1.printStackTrace();
|
|
||||||
} catch (ExecutionException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
|
|
||||||
Set<Set<Serializable>> clusters = new HashSet<Set<Serializable>>();
|
|
||||||
|
|
||||||
for (Map<Serializable,Set<Serializable>> cluster_map : cluster_maps) {
|
|
||||||
for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
|
|
||||||
Set<Serializable> v = e.getValue();
|
|
||||||
if (v.size() > 1) {
|
|
||||||
clusters.add(v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Set<Serializable>> sorted_clusters = new ArrayList<Set<Serializable>>(clusters);
|
|
||||||
|
|
||||||
Collections.sort(sorted_clusters, new SizeComparator());
|
|
||||||
|
|
||||||
return sorted_clusters;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Set<Serializable>> getClustersSingleThread(double radius) {
|
|
||||||
|
|
||||||
Map<Serializable,Set<Serializable>> cluster_map = new HashMap<Serializable,Set<Serializable>>();
|
|
||||||
|
|
||||||
for (Set<String> set : blocks.values()) {
|
|
||||||
if (set.size() < 2) continue;
|
|
||||||
for (String a : set) {
|
|
||||||
for (String b : set) {
|
|
||||||
if (a == b) continue;
|
|
||||||
if (cluster_map.containsKey(a) && cluster_map.get(a).contains(b)) continue;
|
|
||||||
if (cluster_map.containsKey(b) && cluster_map.get(b).contains(a)) continue;
|
|
||||||
double d = _distance.d(a,b);
|
|
||||||
if (d <= radius || radius < 0) {
|
|
||||||
Set<Serializable> l = null;
|
|
||||||
if (!cluster_map.containsKey(a)) {
|
|
||||||
l = new TreeSet<Serializable>();
|
|
||||||
l.add(a);
|
|
||||||
cluster_map.put(a, l);
|
|
||||||
} else {
|
|
||||||
l = cluster_map.get(a);
|
|
||||||
}
|
|
||||||
l.add(b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>();
|
|
||||||
for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
|
|
||||||
Set<Serializable> v = e.getValue();
|
|
||||||
if (v.size() > 1) {
|
|
||||||
clusters.add(v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Collections.sort(clusters, new SizeComparator());
|
|
||||||
|
|
||||||
return clusters;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,63 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.clustering;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
import edu.mit.simile.vicino.vptree.Node;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTree;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
|
||||||
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
|
|
||||||
|
|
||||||
public class VPTreeClusterer extends Clusterer {
|
|
||||||
|
|
||||||
VPTreeBuilder _treeBuilder;
|
|
||||||
Distance _distance;
|
|
||||||
|
|
||||||
public VPTreeClusterer(Distance d) {
|
|
||||||
_distance = d;
|
|
||||||
_treeBuilder = new VPTreeBuilder(d);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void populate(String s) {
|
|
||||||
_treeBuilder.populate(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Set<Serializable>> getClusters(double radius) {
|
|
||||||
VPTree tree = _treeBuilder.buildVPTree();
|
|
||||||
System.out.println("distances after the tree: " + _distance.getCount());
|
|
||||||
Set<Node> nodes = _treeBuilder.getNodes();
|
|
||||||
|
|
||||||
VPTreeSeeker seeker = new VPTreeSeeker(_distance,tree);
|
|
||||||
Map<Serializable,Boolean> flags = new HashMap<Serializable,Boolean>();
|
|
||||||
for (Node n : nodes) {
|
|
||||||
flags.put(n.get(), true);
|
|
||||||
}
|
|
||||||
|
|
||||||
Map<Serializable,Set<Serializable>> map = new HashMap<Serializable,Set<Serializable>>();
|
|
||||||
for (Node n : nodes) {
|
|
||||||
Serializable s = n.get();
|
|
||||||
if (flags.get(s)) {
|
|
||||||
Set<Serializable> results = seeker.range(s, radius);
|
|
||||||
for (Serializable ss : results) {
|
|
||||||
flags.put(ss, false);
|
|
||||||
}
|
|
||||||
if (results.size() > 1) {
|
|
||||||
map.put(s, results);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>(map.values());
|
|
||||||
Collections.sort(clusters, new SizeComparator());
|
|
||||||
|
|
||||||
return clusters;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -1,26 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.tools.bzip2.CBZip2OutputStream;
|
|
||||||
|
|
||||||
public class BZip2Distance extends PseudoMetricDistance {
|
|
||||||
|
|
||||||
public double d2(String x, String y) {
|
|
||||||
String str = x + y;
|
|
||||||
double result = 0.0f;
|
|
||||||
try {
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
|
||||||
CBZip2OutputStream os = new CBZip2OutputStream(baos);
|
|
||||||
os.write(str.getBytes());
|
|
||||||
os.close();
|
|
||||||
baos.close();
|
|
||||||
result = baos.toByteArray().length;
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,17 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
/**
 * Base type for string distance functions that carries a tally of the
 * computations performed. This class never advances the tally itself; that
 * is left to subclasses.
 */
public abstract class Distance {

    /** Running tally; starts at zero. */
    int counter = 0;

    /**
     * Computes the distance between the two given strings.
     */
    public abstract double d(String x, String y);

    /** Returns the current value of the tally. */
    public int getCount() {
        return this.counter;
    }

    /** Sets the tally back to zero. */
    public void resetCounter() {
        this.counter = 0;
    }
}
|
|
@ -1,26 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.zip.GZIPOutputStream;
|
|
||||||
|
|
||||||
public class GZipDistance extends PseudoMetricDistance {
|
|
||||||
|
|
||||||
public double d2(String x, String y) {
|
|
||||||
String str = x + y;
|
|
||||||
double result = 0.0f;
|
|
||||||
try {
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
|
||||||
GZIPOutputStream os = new GZIPOutputStream(baos);
|
|
||||||
os.write(str.getBytes());
|
|
||||||
os.close();
|
|
||||||
baos.close();
|
|
||||||
result = baos.toByteArray().length;
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,18 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import com.wcohen.ss.Jaccard;
|
|
||||||
import com.wcohen.ss.api.StringDistance;
|
|
||||||
|
|
||||||
public class JaccardDistance extends MetricDistance {
|
|
||||||
|
|
||||||
StringDistance distance;
|
|
||||||
|
|
||||||
public JaccardDistance() {
|
|
||||||
this.distance = new Jaccard();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected double d2(String x, String y) {
|
|
||||||
return this.distance.score(x, y);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,18 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import com.wcohen.ss.Jaro;
|
|
||||||
import com.wcohen.ss.api.StringDistance;
|
|
||||||
|
|
||||||
public class JaroDistance extends MetricDistance {
|
|
||||||
|
|
||||||
StringDistance distance;
|
|
||||||
|
|
||||||
public JaroDistance() {
|
|
||||||
this.distance = new Jaro();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected double d2(String x, String y) {
|
|
||||||
return this.distance.score(x, y);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,18 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import com.wcohen.ss.JaroWinkler;
|
|
||||||
import com.wcohen.ss.api.StringDistance;
|
|
||||||
|
|
||||||
public class JaroWinklerDistance extends MetricDistance {
|
|
||||||
|
|
||||||
StringDistance distance;
|
|
||||||
|
|
||||||
public JaroWinklerDistance() {
|
|
||||||
this.distance = new JaroWinkler();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected double d2(String x, String y) {
|
|
||||||
return this.distance.score(x, y);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,18 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import com.wcohen.ss.JaroWinklerTFIDF;
|
|
||||||
import com.wcohen.ss.api.StringDistance;
|
|
||||||
|
|
||||||
public class JaroWinklerTFIDFDistance extends MetricDistance {
|
|
||||||
|
|
||||||
StringDistance distance;
|
|
||||||
|
|
||||||
public JaroWinklerTFIDFDistance() {
|
|
||||||
this.distance = new JaroWinklerTFIDF();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected double d2(String x, String y) {
|
|
||||||
return this.distance.score(x, y);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,18 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import com.wcohen.ss.Levenstein;
|
|
||||||
import com.wcohen.ss.api.StringDistance;
|
|
||||||
|
|
||||||
public class LevenshteinDistance extends MetricDistance {
|
|
||||||
|
|
||||||
StringDistance distance;
|
|
||||||
|
|
||||||
public LevenshteinDistance() {
|
|
||||||
this.distance = new Levenstein();
|
|
||||||
}
|
|
||||||
|
|
||||||
public double d2(String x, String y) {
|
|
||||||
return Math.abs(this.distance.score(x, y));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,24 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
|
|
||||||
public abstract class MetricDistance extends Distance {
|
|
||||||
|
|
||||||
/*
|
|
||||||
* public float d(String x,String y) {
|
|
||||||
* float dxy = d2(x,y);
|
|
||||||
* float dx = d2(x,"");
|
|
||||||
* float dy = d2(y,"");
|
|
||||||
* float result = dxy / (dx + dy);
|
|
||||||
* return result;
|
|
||||||
* }
|
|
||||||
*/
|
|
||||||
|
|
||||||
public double d(String x, String y) {
|
|
||||||
double result = d2(x, y);
|
|
||||||
counter += 1;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
abstract double d2(String x, String y);
|
|
||||||
|
|
||||||
}
|
|
@ -1,27 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import com.colloquial.arithcode.ArithCodeOutputStream;
|
|
||||||
import com.colloquial.arithcode.PPMModel;
|
|
||||||
|
|
||||||
public class PPMDistance extends PseudoMetricDistance {
|
|
||||||
|
|
||||||
public double d2(String x, String y) {
|
|
||||||
String str = x + y;
|
|
||||||
double result = 0.0f;
|
|
||||||
try {
|
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
|
||||||
ArithCodeOutputStream os = new ArithCodeOutputStream(baos,new PPMModel(8));
|
|
||||||
os.write(str.getBytes());
|
|
||||||
os.close();
|
|
||||||
baos.close();
|
|
||||||
result = baos.toByteArray().length;
|
|
||||||
} catch (IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,16 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.distances;
|
|
||||||
|
|
||||||
|
|
||||||
public abstract class PseudoMetricDistance extends Distance {
|
|
||||||
|
|
||||||
public double d(String x, String y) {
|
|
||||||
double cxx = d2(x, x);
|
|
||||||
double cyy = d2(y, y);
|
|
||||||
double cxy = d2(x, y);
|
|
||||||
double cyx = d2(y, x);
|
|
||||||
counter += 4;
|
|
||||||
return 10.0d * ((cxy + cyx) / (cxx + cyy) - 1.0d);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected abstract double d2(String x, String y);
|
|
||||||
}
|
|
@ -1,58 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class represent a couple (Object, distance) value of that Object from
|
|
||||||
* the Vp in each step of the algorithm.
|
|
||||||
*
|
|
||||||
* @author Paolo Ciccarese
|
|
||||||
*/
|
|
||||||
public class Node implements Serializable {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = -2077473220894258550L;
|
|
||||||
|
|
||||||
private final Serializable obj;
|
|
||||||
private double distance;
|
|
||||||
|
|
||||||
public Node(Serializable obj, int i) {
|
|
||||||
this.obj = obj;
|
|
||||||
this.distance = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Node(Serializable obj) {
|
|
||||||
this.obj = obj;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Serializable get() {
|
|
||||||
return this.obj;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setDistance(double distance) {
|
|
||||||
this.distance = distance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double getDistance() {
|
|
||||||
return distance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return obj.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o) {
|
|
||||||
if (this == o) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (o instanceof Node) {
|
|
||||||
return ((Node) o).get().equals(this.obj);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return this.obj.hashCode();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,94 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
/**
 * Sorts arrays of {@link Node} in place, in ascending order of
 * {@link Node#getDistance()}, using a recursive quicksort with
 * median-of-three pivot selection.
 */
public class NodeSorter {

    /**
     * Sorts an array of nodes.
     *
     * @param nodes
     *            the array to sort in place
     */
    public void sort(Node nodes[]) {
        NodeSorter.sort(nodes, 0, nodes.length - 1);
    }

    /**
     * Sorts a slice of an array of nodes using the QuickSort algorithm.
     *
     * @param nodes
     *            the array holding the slice to sort in place
     * @param lo
     *            The current lower bound (inclusive).
     * @param hi
     *            The current upper bound (inclusive).
     */
    public static void sort(Node nodes[], int lo, int hi) {
        // Zero or one element: nothing to do.
        if (lo >= hi) {
            return;
        }

        /*
         * Use median-of-three(lo, mid, hi) to pick a partition. Also swap them
         * into relative order while we are at it.
         */
        int mid = (lo + hi) / 2;

        if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
            // Swap lo and mid.
            Node tmp = nodes[lo];
            nodes[lo] = nodes[mid];
            nodes[mid] = tmp;
        }

        if (nodes[mid].getDistance() > nodes[hi].getDistance()) {
            // Swap mid and hi.
            Node tmp = nodes[mid];
            nodes[mid] = nodes[hi];
            nodes[hi] = tmp;

            // The value that just arrived at mid may be smaller than lo.
            if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
                // Swap lo and mid.
                Node tmp2 = nodes[lo];
                nodes[lo] = nodes[mid];
                nodes[mid] = tmp2;
            }
        }

        // Start one past lo since already handled lo.

        int left = lo + 1;

        // Similarly, end one before hi since already handled hi.

        int right = hi - 1;

        // If there are three or fewer elements, we are done.

        if (left >= right) {
            return;
        }

        // The pivot is held by reference, so it stays valid even when the
        // swaps below move that node to another index.
        Node partition = nodes[mid];

        while (true) {
            // Walk right down to the nearest element not above the pivot.
            while (nodes[right].getDistance() > partition.getDistance()) {
                --right;
            }

            // Walk left up to the nearest element above the pivot.
            while (left < right && nodes[left].getDistance() <= partition.getDistance()) {
                ++left;
            }

            if (left < right) {
                // Swap the out-of-place pair.
                Node tmp = nodes[left];
                nodes[left] = nodes[right];
                nodes[right] = tmp;

                --right;
            } else {
                break;
            }
        }

        // Recurse on the two sides of the meeting point.
        sort(nodes, lo, left);
        sort(nodes, left + 1, hi);
    }
}
|
|
@ -1,56 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
 * Binary tree node holding an object, a median value and optional left and
 * right children.
 *
 * @author Paolo Ciccarese
 */
public class TNode implements Serializable {

    private static final long serialVersionUID = -217604190976851241L;

    private final Serializable obj;
    private double median;
    private TNode left;
    private TNode right;

    /**
     * Fixes the object for the lifetime of the node; the median and the
     * children are filled in later through the setters.
     */
    public TNode(Serializable obj) {
        this.obj = obj;
    }

    /** Returns the object held by this node. */
    public Serializable get() {
        return obj;
    }

    public double getMedian() {
        return this.median;
    }

    public void setMedian(double median) {
        this.median = median;
    }

    public TNode getLeft() {
        return this.left;
    }

    public void setLeft(TNode leftNode) {
        left = leftNode;
    }

    public TNode getRight() {
        return this.right;
    }

    public void setRight(TNode rightNode) {
        right = rightNode;
    }

    /** Delegates to the held object's {@code toString()}. */
    public String toString() {
        return obj.toString();
    }
}
|
|
@ -1,33 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The VPTree class.
|
|
||||||
*
|
|
||||||
* @author Paolo Ciccarese
|
|
||||||
*/
|
|
||||||
public class VPTree implements Serializable {
|
|
||||||
|
|
||||||
private static final long serialVersionUID = 1291056732155841123L;
|
|
||||||
|
|
||||||
private TNode root;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the root of the VPTree.
|
|
||||||
*
|
|
||||||
* @param root The VPTree root.
|
|
||||||
*/
|
|
||||||
public void setRoot(TNode root) {
|
|
||||||
this.root = root;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the root of the VPTree.
|
|
||||||
*
|
|
||||||
* @return The VPTree root.
|
|
||||||
*/
|
|
||||||
public TNode getRoot() {
|
|
||||||
return root;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,192 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author Paolo Ciccarese
|
|
||||||
* @author Stefano Mazzocchi
|
|
||||||
*/
|
|
||||||
public class VPTreeBuilder {
|
|
||||||
|
|
||||||
private static final boolean DEBUG = false;
|
|
||||||
private static final boolean OPTIMIZED = false;
|
|
||||||
private static final int sample_size = 10;
|
|
||||||
|
|
||||||
private Random generator = new Random(System.currentTimeMillis());
|
|
||||||
|
|
||||||
private final Distance distance;
|
|
||||||
|
|
||||||
private Set<Node> nodes = new HashSet<Node>();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Defines a VPTree Builder for a specific distance.
|
|
||||||
*
|
|
||||||
* @param distance
|
|
||||||
* The class implementing the distance.
|
|
||||||
*/
|
|
||||||
public VPTreeBuilder(Distance distance) {
|
|
||||||
this.distance = distance;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<Node> getNodes() {
|
|
||||||
return this.nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void populate(Serializable s) {
|
|
||||||
nodes.add(new Node(s));
|
|
||||||
}
|
|
||||||
|
|
||||||
public VPTree buildVPTree() {
|
|
||||||
if (DEBUG) {
|
|
||||||
for (Node n : this.nodes) {
|
|
||||||
System.out.println(n.get().toString());
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
}
|
|
||||||
Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]);
|
|
||||||
VPTree tree = new VPTree();
|
|
||||||
if (nodes_array.length > 0) {
|
|
||||||
tree.setRoot(makeNode(nodes_array, 0, nodes_array.length - 1));
|
|
||||||
}
|
|
||||||
return tree;
|
|
||||||
}
|
|
||||||
|
|
||||||
public VPTree buildVPTree(Collection<? extends Serializable> values) {
|
|
||||||
reset();
|
|
||||||
for (Serializable s : values) {
|
|
||||||
populate(s);
|
|
||||||
}
|
|
||||||
return buildVPTree();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reset() {
|
|
||||||
this.nodes.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
private TNode makeNode(Node nodes[], int begin, int end) {
|
|
||||||
|
|
||||||
int delta = end - begin;
|
|
||||||
|
|
||||||
if (DEBUG) System.out.println("\ndelta: " + delta);
|
|
||||||
|
|
||||||
if (delta == 0) {
|
|
||||||
TNode vpNode = new TNode(nodes[begin].get());
|
|
||||||
vpNode.setMedian(0);
|
|
||||||
return vpNode;
|
|
||||||
} else if (delta < 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
Node randomNode = getVantagePoint(nodes, begin, end);
|
|
||||||
TNode vpNode = new TNode(randomNode.get());
|
|
||||||
|
|
||||||
if (DEBUG) System.out.println("\nvp-node: " + vpNode.get().toString());
|
|
||||||
|
|
||||||
calculateDistances(vpNode, nodes, begin, end);
|
|
||||||
orderDistances(nodes, begin, end);
|
|
||||||
fixVantagPoint(randomNode, nodes, begin, end);
|
|
||||||
|
|
||||||
if (DEBUG) {
|
|
||||||
for (int i = begin; i <= end; i++) {
|
|
||||||
System.out.println(" +-- " + nodes[i].getDistance() + " --> " + nodes[i].get());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
float median = (float) median(nodes, begin, end);
|
|
||||||
vpNode.setMedian(median);
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
for (i = begin + 1; i < end; i++) {
|
|
||||||
if (nodes[i].getDistance() >= median) {
|
|
||||||
vpNode.setLeft(makeNode(nodes, begin + 1, i - 1));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
vpNode.setRight(makeNode(nodes, i, end));
|
|
||||||
|
|
||||||
return vpNode;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Node getVantagePoint(Node nodes[], int begin, int end) {
|
|
||||||
if (OPTIMIZED) {
|
|
||||||
Node buffer[] = new Node[sample_size];
|
|
||||||
for (int i = 0; i < sample_size; i++) {
|
|
||||||
buffer[i] = getRandomNode(nodes,begin,end);
|
|
||||||
}
|
|
||||||
|
|
||||||
double bestSpread = 0;
|
|
||||||
Node bestNode = buffer[0];
|
|
||||||
for (int i = 0; i < sample_size; i++) {
|
|
||||||
calculateDistances(new TNode(buffer[i]), buffer, 0, buffer.length - 1);
|
|
||||||
orderDistances(nodes, begin, end);
|
|
||||||
double median = (double) median(nodes, begin, end);
|
|
||||||
double spread = deviation(buffer, median);
|
|
||||||
System.out.println(" " + spread);
|
|
||||||
if (spread > bestSpread) {
|
|
||||||
bestSpread = spread;
|
|
||||||
bestNode = buffer[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
System.out.println("best: " + bestSpread);
|
|
||||||
return bestNode;
|
|
||||||
} else {
|
|
||||||
return getRandomNode(nodes,begin,end);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Node getRandomNode(Node nodes[], int begin, int end) {
|
|
||||||
return nodes[begin + generator.nextInt(end - begin)];
|
|
||||||
}
|
|
||||||
|
|
||||||
private double deviation(Node buffer[], double median) {
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < buffer.length; i++) {
|
|
||||||
sum += Math.pow(buffer[i].getDistance() - median, 2);
|
|
||||||
}
|
|
||||||
return sum / buffer.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
public double median(Node nodes[], int begin, int end) {
|
|
||||||
int delta = end - begin;
|
|
||||||
int middle = delta / 2;
|
|
||||||
|
|
||||||
if (delta % 2 == 0) {
|
|
||||||
return nodes[begin + middle].getDistance();
|
|
||||||
} else {
|
|
||||||
return (nodes[begin + middle].getDistance() + nodes[begin + middle + 1].getDistance()) / 2.0d;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void calculateDistances(TNode pivot, Node nodes[], int begin, int end) {
|
|
||||||
Serializable x = pivot.get();
|
|
||||||
for (int i = begin; i <= end; i++) {
|
|
||||||
Serializable y = nodes[i].get();
|
|
||||||
double d = (x == y || x.equals(y)) ? 0.0d : distance.d(x.toString(), y.toString());
|
|
||||||
nodes[i].setDistance(d);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void fixVantagPoint(Node pivot, Node nodes[], int begin, int end) {
|
|
||||||
for (int i = begin; i < end; i++) {
|
|
||||||
if (nodes[i] == pivot) {
|
|
||||||
if (i > begin) {
|
|
||||||
Node tmp = nodes[begin];
|
|
||||||
nodes[begin] = pivot;
|
|
||||||
nodes[i] = tmp;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void orderDistances(Node nodes[], int begin, int end) {
|
|
||||||
NodeSorter.sort(nodes, begin, end);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,59 +0,0 @@
|
|||||||
package edu.mit.simile.vicino.vptree;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import edu.mit.simile.vicino.distances.Distance;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @author Paolo Ciccarese
|
|
||||||
*/
|
|
||||||
public class VPTreeSeeker {
|
|
||||||
|
|
||||||
private static final boolean DEBUG = false;
|
|
||||||
|
|
||||||
VPTree tree;
|
|
||||||
Distance distance;
|
|
||||||
|
|
||||||
public VPTreeSeeker(Distance distance, VPTree tree) {
|
|
||||||
this.distance = distance;
|
|
||||||
this.tree = tree;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<Serializable> range(Serializable query, double range) {
|
|
||||||
if (DEBUG) System.out.println("--------------- " + query + " " + range);
|
|
||||||
return rangeTraversal(query, range, tree.getRoot(), new HashSet<Serializable>());
|
|
||||||
}
|
|
||||||
|
|
||||||
private Set<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, Set<Serializable> results) {
|
|
||||||
|
|
||||||
if (DEBUG) System.out.println("> " + tNode);
|
|
||||||
|
|
||||||
if (tNode != null) {
|
|
||||||
double distance = this.distance.d(query.toString(), tNode.get().toString());
|
|
||||||
|
|
||||||
if (distance <= range) {
|
|
||||||
if (DEBUG) System.out.println("*** add ***");
|
|
||||||
results.add(tNode.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((distance + range) < tNode.getMedian()) {
|
|
||||||
if (DEBUG) System.out.println("left: " + distance + " + " + range + " < " + tNode.getMedian());
|
|
||||||
rangeTraversal(query, range, tNode.getLeft(), results);
|
|
||||||
} else if ((distance - range) > tNode.getMedian()) {
|
|
||||||
if (DEBUG) System.out.println("right: " + distance + " + " + range + " > " + tNode.getMedian());
|
|
||||||
rangeTraversal(query, range, tNode.getRight(), results);
|
|
||||||
} else {
|
|
||||||
if (DEBUG) System.out.println("left & right: " + distance + " + " + range + " = " + tNode.getMedian());
|
|
||||||
rangeTraversal(query, range, tNode.getLeft(), results);
|
|
||||||
rangeTraversal(query, range, tNode.getRight(), results);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (DEBUG) System.out.println("< " + tNode);
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,136 +0,0 @@
|
|||||||
/*
|
|
||||||
* The Apache Software License, Version 1.1
|
|
||||||
*
|
|
||||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in
|
|
||||||
* the documentation and/or other materials provided with the
|
|
||||||
* distribution.
|
|
||||||
*
|
|
||||||
* 3. The end-user documentation included with the redistribution, if
|
|
||||||
* any, must include the following acknowlegement:
|
|
||||||
* "This product includes software developed by the
|
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
|
||||||
* Alternately, this acknowlegement may appear in the software itself,
|
|
||||||
* if and wherever such third-party acknowlegements normally appear.
|
|
||||||
*
|
|
||||||
* 4. The names "Ant" and "Apache Software
|
|
||||||
* Foundation" must not be used to endorse or promote products derived
|
|
||||||
* from this software without prior written permission. For written
|
|
||||||
* permission, please contact apache@apache.org.
|
|
||||||
*
|
|
||||||
* 5. Products derived from this software may not be called "Apache"
|
|
||||||
* nor may "Apache" appear in their names without prior written
|
|
||||||
* permission of the Apache Group.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
* ====================================================================
|
|
||||||
*
|
|
||||||
* This software consists of voluntary contributions made by many
|
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
|
||||||
* information on the Apache Software Foundation, please see
|
|
||||||
* <http://www.apache.org/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
|
||||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
|
||||||
* great code.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.tools.bzip2;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Base class for both the compress and decompress classes.
|
|
||||||
* Holds common arrays, and static data.
|
|
||||||
*
|
|
||||||
* @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
|
|
||||||
*/
|
|
||||||
public interface BZip2Constants {

    /** Multiplied by the 1..9 block-size level to obtain the block size. */
    int baseBlockSize = 100000;

    int MAX_ALPHA_SIZE = 258;
    int MAX_CODE_LEN = 23;

    /** Run-length symbols. */
    int RUNA = 0;
    int RUNB = 1;

    int N_GROUPS = 6;
    int G_SIZE = 50;
    int N_ITERS = 4;

    /** One selector per group of G_SIZE symbols, plus two spare entries. */
    int MAX_SELECTORS = (2 + (900000 / G_SIZE));

    int NUM_OVERSHOOT_BYTES = 20;

    /** Table of 512 values consumed cyclically when a block is flagged as randomised. */
    int[] rNums = {
        619, 720, 127, 481, 931, 816, 813, 233, 566, 247,
        985, 724, 205, 454, 863, 491, 741, 242, 949, 214,
        733, 859, 335, 708, 621, 574,  73, 654, 730, 472,
        419, 436, 278, 496, 867, 210, 399, 680, 480,  51,
        878, 465, 811, 169, 869, 675, 611, 697, 867, 561,
        862, 687, 507, 283, 482, 129, 807, 591, 733, 623,
        150, 238,  59, 379, 684, 877, 625, 169, 643, 105,
        170, 607, 520, 932, 727, 476, 693, 425, 174, 647,
         73, 122, 335, 530, 442, 853, 695, 249, 445, 515,
        909, 545, 703, 919, 874, 474, 882, 500, 594, 612,
        641, 801, 220, 162, 819, 984, 589, 513, 495, 799,
        161, 604, 958, 533, 221, 400, 386, 867, 600, 782,
        382, 596, 414, 171, 516, 375, 682, 485, 911, 276,
         98, 553, 163, 354, 666, 933, 424, 341, 533, 870,
        227, 730, 475, 186, 263, 647, 537, 686, 600, 224,
        469,  68, 770, 919, 190, 373, 294, 822, 808, 206,
        184, 943, 795, 384, 383, 461, 404, 758, 839, 887,
        715,  67, 618, 276, 204, 918, 873, 777, 604, 560,
        951, 160, 578, 722,  79, 804,  96, 409, 713, 940,
        652, 934, 970, 447, 318, 353, 859, 672, 112, 785,
        645, 863, 803, 350, 139,  93, 354,  99, 820, 908,
        609, 772, 154, 274, 580, 184,  79, 626, 630, 742,
        653, 282, 762, 623, 680,  81, 927, 626, 789, 125,
        411, 521, 938, 300, 821,  78, 343, 175, 128, 250,
        170, 774, 972, 275, 999, 639, 495,  78, 352, 126,
        857, 956, 358, 619, 580, 124, 737, 594, 701, 612,
        669, 112, 134, 694, 363, 992, 809, 743, 168, 974,
        944, 375, 748,  52, 600, 747, 642, 182, 862,  81,
        344, 805, 988, 739, 511, 655, 814, 334, 249, 515,
        897, 955, 664, 981, 649, 113, 974, 459, 893, 228,
        433, 837, 553, 268, 926, 240, 102, 654, 459,  51,
        686, 754, 806, 760, 493, 403, 415, 394, 687, 700,
        946, 670, 656, 610, 738, 392, 760, 799, 887, 653,
        978, 321, 576, 617, 626, 502, 894, 679, 243, 440,
        680, 879, 194, 572, 640, 724, 926,  56, 204, 700,
        707, 151, 457, 449, 797, 195, 791, 558, 945, 679,
        297,  59,  87, 824, 713, 663, 412, 693, 342, 606,
        134, 108, 571, 364, 631, 212, 174, 643, 304, 329,
        343,  97, 430, 751, 497, 314, 983, 374, 822, 928,
        140, 206,  73, 263, 980, 736, 876, 478, 430, 305,
        170, 514, 364, 692, 829,  82, 855, 953, 676, 246,
        369, 970, 294, 750, 807, 827, 150, 790, 288, 923,
        804, 378, 215, 828, 592, 281, 565, 555, 710,  82,
        896, 831, 547, 261, 524, 462, 293, 465, 502,  56,
        661, 821, 976, 991, 658, 869, 905, 758, 745, 193,
        768, 550, 608, 933, 378, 286, 215, 979, 792, 961,
         61, 688, 793, 644, 986, 403, 106, 366, 905, 644,
        372, 567, 466, 434, 645, 210, 389, 550, 919, 135,
        780, 773, 635, 389, 707, 100, 626, 958, 165, 504,
        920, 176, 193, 713, 857, 265, 203,  50, 668, 108,
        645, 990, 626, 197, 510, 357, 358, 850, 858, 364,
        936, 638
    };
}
|
|
@ -1,865 +0,0 @@
|
|||||||
/*
|
|
||||||
* The Apache Software License, Version 1.1
|
|
||||||
*
|
|
||||||
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in
|
|
||||||
* the documentation and/or other materials provided with the
|
|
||||||
* distribution.
|
|
||||||
*
|
|
||||||
* 3. The end-user documentation included with the redistribution, if
|
|
||||||
* any, must include the following acknowlegement:
|
|
||||||
* "This product includes software developed by the
|
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
|
||||||
* Alternately, this acknowlegement may appear in the software itself,
|
|
||||||
* if and wherever such third-party acknowlegements normally appear.
|
|
||||||
*
|
|
||||||
* 4. The names "Ant" and "Apache Software
|
|
||||||
* Foundation" must not be used to endorse or promote products derived
|
|
||||||
* from this software without prior written permission. For written
|
|
||||||
* permission, please contact apache@apache.org.
|
|
||||||
*
|
|
||||||
* 5. Products derived from this software may not be called "Apache"
|
|
||||||
* nor may "Apache" appear in their names without prior written
|
|
||||||
* permission of the Apache Group.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
* ====================================================================
|
|
||||||
*
|
|
||||||
* This software consists of voluntary contributions made by many
|
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
|
||||||
* information on the Apache Software Foundation, please see
|
|
||||||
* <http://www.apache.org/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
|
||||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
|
||||||
* great code.
|
|
||||||
*/
|
|
||||||
package org.apache.tools.bzip2;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* An input stream that decompresses from the BZip2 format (without the file
|
|
||||||
* header chars) to be read as any other stream.
|
|
||||||
*
|
|
||||||
* @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
|
|
||||||
*/
|
|
||||||
public class CBZip2InputStream extends InputStream implements BZip2Constants {
|
|
||||||
private static void cadvise() {
|
|
||||||
System.out.println("CRC Error");
|
|
||||||
//throw new CCoruptionError();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void compressedStreamEOF() {
|
|
||||||
cadvise();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void makeMaps() {
|
|
||||||
int i;
|
|
||||||
nInUse = 0;
|
|
||||||
for (i = 0; i < 256; i++) {
|
|
||||||
if (inUse[i]) {
|
|
||||||
seqToUnseq[nInUse] = (char) i;
|
|
||||||
unseqToSeq[i] = (char) nInUse;
|
|
||||||
nInUse++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
index of the last char in the block, so
|
|
||||||
the block size == last + 1.
|
|
||||||
*/
|
|
||||||
private int last;
|
|
||||||
|
|
||||||
/*
|
|
||||||
index in zptr[] of original string after sorting.
|
|
||||||
*/
|
|
||||||
private int origPtr;
|
|
||||||
|
|
||||||
/*
|
|
||||||
always: in the range 0 .. 9.
|
|
||||||
The current block size is 100000 * this number.
|
|
||||||
*/
|
|
||||||
private int blockSize100k;
|
|
||||||
|
|
||||||
private boolean blockRandomised;
|
|
||||||
|
|
||||||
private int bsBuff;
|
|
||||||
private int bsLive;
|
|
||||||
private CRC mCrc = new CRC();
|
|
||||||
|
|
||||||
private boolean[] inUse = new boolean[256];
|
|
||||||
private int nInUse;
|
|
||||||
|
|
||||||
private char[] seqToUnseq = new char[256];
|
|
||||||
private char[] unseqToSeq = new char[256];
|
|
||||||
|
|
||||||
private char[] selector = new char[MAX_SELECTORS];
|
|
||||||
private char[] selectorMtf = new char[MAX_SELECTORS];
|
|
||||||
|
|
||||||
private int[] tt;
|
|
||||||
private char[] ll8;
|
|
||||||
|
|
||||||
/*
|
|
||||||
freq table collected to save a pass over the data
|
|
||||||
during decompression.
|
|
||||||
*/
|
|
||||||
private int[] unzftab = new int[256];
|
|
||||||
|
|
||||||
private int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
|
||||||
private int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
|
||||||
private int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
|
||||||
private int[] minLens = new int[N_GROUPS];
|
|
||||||
|
|
||||||
private InputStream bsStream;
|
|
||||||
|
|
||||||
private boolean streamEnd = false;
|
|
||||||
|
|
||||||
private int currentChar = -1;
|
|
||||||
|
|
||||||
private static final int START_BLOCK_STATE = 1;
|
|
||||||
private static final int RAND_PART_A_STATE = 2;
|
|
||||||
private static final int RAND_PART_B_STATE = 3;
|
|
||||||
private static final int RAND_PART_C_STATE = 4;
|
|
||||||
private static final int NO_RAND_PART_A_STATE = 5;
|
|
||||||
private static final int NO_RAND_PART_B_STATE = 6;
|
|
||||||
private static final int NO_RAND_PART_C_STATE = 7;
|
|
||||||
|
|
||||||
private int currentState = START_BLOCK_STATE;
|
|
||||||
|
|
||||||
private int storedBlockCRC, storedCombinedCRC;
|
|
||||||
private int computedBlockCRC, computedCombinedCRC;
|
|
||||||
|
|
||||||
int i2, count, chPrev, ch2;
|
|
||||||
int i, tPos;
|
|
||||||
int rNToGo = 0;
|
|
||||||
int rTPos = 0;
|
|
||||||
int j2;
|
|
||||||
char z;
|
|
||||||
|
|
||||||
/**
 * Creates a decompressing stream over {@code zStream} and decodes the first block.
 *
 * <p>The two bytes following the file header chars are read immediately. If they
 * are not {@code 'h'} followed by a digit {@code '1'..'9'}, the stream is marked
 * as ended and every {@link #read()} returns -1.
 *
 * @param zStream the compressed input
 */
public CBZip2InputStream(InputStream zStream) {
    bsSetStream(zStream);
    initialize();
    if (streamEnd) {
        // initialize() rejected the header and has already released the
        // underlying stream; reading a block header now would dereference it.
        return;
    }
    initBlock();
    setupBlock();
}
|
|
||||||
|
|
||||||
public int read() {
|
|
||||||
if (streamEnd) {
|
|
||||||
return -1;
|
|
||||||
} else {
|
|
||||||
int retChar = currentChar;
|
|
||||||
switch(currentState) {
|
|
||||||
case START_BLOCK_STATE:
|
|
||||||
break;
|
|
||||||
case RAND_PART_A_STATE:
|
|
||||||
break;
|
|
||||||
case RAND_PART_B_STATE:
|
|
||||||
setupRandPartB();
|
|
||||||
break;
|
|
||||||
case RAND_PART_C_STATE:
|
|
||||||
setupRandPartC();
|
|
||||||
break;
|
|
||||||
case NO_RAND_PART_A_STATE:
|
|
||||||
break;
|
|
||||||
case NO_RAND_PART_B_STATE:
|
|
||||||
setupNoRandPartB();
|
|
||||||
break;
|
|
||||||
case NO_RAND_PART_C_STATE:
|
|
||||||
setupNoRandPartC();
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return retChar;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initialize() {
|
|
||||||
char magic3, magic4;
|
|
||||||
magic3 = bsGetUChar();
|
|
||||||
magic4 = bsGetUChar();
|
|
||||||
if (magic3 != 'h' || magic4 < '1' || magic4 > '9') {
|
|
||||||
bsFinishedWithStream();
|
|
||||||
streamEnd = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
setDecompressStructureSizes(magic4 - '0');
|
|
||||||
computedCombinedCRC = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initBlock() {
|
|
||||||
char magic1, magic2, magic3, magic4;
|
|
||||||
char magic5, magic6;
|
|
||||||
magic1 = bsGetUChar();
|
|
||||||
magic2 = bsGetUChar();
|
|
||||||
magic3 = bsGetUChar();
|
|
||||||
magic4 = bsGetUChar();
|
|
||||||
magic5 = bsGetUChar();
|
|
||||||
magic6 = bsGetUChar();
|
|
||||||
if (magic1 == 0x17 && magic2 == 0x72 && magic3 == 0x45
|
|
||||||
&& magic4 == 0x38 && magic5 == 0x50 && magic6 == 0x90) {
|
|
||||||
complete();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (magic1 != 0x31 || magic2 != 0x41 || magic3 != 0x59
|
|
||||||
|| magic4 != 0x26 || magic5 != 0x53 || magic6 != 0x59) {
|
|
||||||
badBlockHeader();
|
|
||||||
streamEnd = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
storedBlockCRC = bsGetInt32();
|
|
||||||
|
|
||||||
if (bsR(1) == 1) {
|
|
||||||
blockRandomised = true;
|
|
||||||
} else {
|
|
||||||
blockRandomised = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// currBlockNo++;
|
|
||||||
getAndMoveToFrontDecode();
|
|
||||||
|
|
||||||
mCrc.initialiseCRC();
|
|
||||||
currentState = START_BLOCK_STATE;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void endBlock() {
|
|
||||||
computedBlockCRC = mCrc.getFinalCRC();
|
|
||||||
/* A bad CRC is considered a fatal error. */
|
|
||||||
if (storedBlockCRC != computedBlockCRC) {
|
|
||||||
crcError();
|
|
||||||
}
|
|
||||||
|
|
||||||
computedCombinedCRC = (computedCombinedCRC << 1)
|
|
||||||
| (computedCombinedCRC >>> 31);
|
|
||||||
computedCombinedCRC ^= computedBlockCRC;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void complete() {
|
|
||||||
storedCombinedCRC = bsGetInt32();
|
|
||||||
if (storedCombinedCRC != computedCombinedCRC) {
|
|
||||||
crcError();
|
|
||||||
}
|
|
||||||
|
|
||||||
bsFinishedWithStream();
|
|
||||||
streamEnd = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void blockOverrun() {
|
|
||||||
cadvise();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void badBlockHeader() {
|
|
||||||
cadvise();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void crcError() {
|
|
||||||
cadvise();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void bsFinishedWithStream() {
|
|
||||||
try {
|
|
||||||
if (this.bsStream != null) {
|
|
||||||
if (this.bsStream != System.in) {
|
|
||||||
this.bsStream.close();
|
|
||||||
this.bsStream = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException ioe) {
|
|
||||||
//ignore
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void bsSetStream(InputStream f) {
|
|
||||||
bsStream = f;
|
|
||||||
bsLive = 0;
|
|
||||||
bsBuff = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int bsR(int n) {
|
|
||||||
int v;
|
|
||||||
while (bsLive < n) {
|
|
||||||
int zzi;
|
|
||||||
char thech = 0;
|
|
||||||
try {
|
|
||||||
thech = (char) bsStream.read();
|
|
||||||
} catch (IOException e) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
if (thech == -1) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
zzi = thech;
|
|
||||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
|
||||||
bsLive += 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
v = (bsBuff >> (bsLive - n)) & ((1 << n) - 1);
|
|
||||||
bsLive -= n;
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
private char bsGetUChar() {
|
|
||||||
return (char) bsR(8);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int bsGetint() {
|
|
||||||
int u = 0;
|
|
||||||
u = (u << 8) | bsR(8);
|
|
||||||
u = (u << 8) | bsR(8);
|
|
||||||
u = (u << 8) | bsR(8);
|
|
||||||
u = (u << 8) | bsR(8);
|
|
||||||
return u;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int bsGetIntVS(int numBits) {
|
|
||||||
return (int) bsR(numBits);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int bsGetInt32() {
|
|
||||||
return (int) bsGetint();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void hbCreateDecodeTables(int[] limit, int[] base,
|
|
||||||
int[] perm, char[] length,
|
|
||||||
int minLen, int maxLen, int alphaSize) {
|
|
||||||
int pp, i, j, vec;
|
|
||||||
|
|
||||||
pp = 0;
|
|
||||||
for (i = minLen; i <= maxLen; i++) {
|
|
||||||
for (j = 0; j < alphaSize; j++) {
|
|
||||||
if (length[j] == i) {
|
|
||||||
perm[pp] = j;
|
|
||||||
pp++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < MAX_CODE_LEN; i++) {
|
|
||||||
base[i] = 0;
|
|
||||||
}
|
|
||||||
for (i = 0; i < alphaSize; i++) {
|
|
||||||
base[length[i] + 1]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 1; i < MAX_CODE_LEN; i++) {
|
|
||||||
base[i] += base[i - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < MAX_CODE_LEN; i++) {
|
|
||||||
limit[i] = 0;
|
|
||||||
}
|
|
||||||
vec = 0;
|
|
||||||
|
|
||||||
for (i = minLen; i <= maxLen; i++) {
|
|
||||||
vec += (base[i + 1] - base[i]);
|
|
||||||
limit[i] = vec - 1;
|
|
||||||
vec <<= 1;
|
|
||||||
}
|
|
||||||
for (i = minLen + 1; i <= maxLen; i++) {
|
|
||||||
base[i] = ((limit[i - 1] + 1) << 1) - base[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void recvDecodingTables() {
|
|
||||||
char len[][] = new char[N_GROUPS][MAX_ALPHA_SIZE];
|
|
||||||
int i, j, t, nGroups, nSelectors, alphaSize;
|
|
||||||
int minLen, maxLen;
|
|
||||||
boolean[] inUse16 = new boolean[16];
|
|
||||||
|
|
||||||
/* Receive the mapping table */
|
|
||||||
for (i = 0; i < 16; i++) {
|
|
||||||
if (bsR(1) == 1) {
|
|
||||||
inUse16[i] = true;
|
|
||||||
} else {
|
|
||||||
inUse16[i] = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < 256; i++) {
|
|
||||||
inUse[i] = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < 16; i++) {
|
|
||||||
if (inUse16[i]) {
|
|
||||||
for (j = 0; j < 16; j++) {
|
|
||||||
if (bsR(1) == 1) {
|
|
||||||
inUse[i * 16 + j] = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
makeMaps();
|
|
||||||
alphaSize = nInUse + 2;
|
|
||||||
|
|
||||||
/* Now the selectors */
|
|
||||||
nGroups = bsR(3);
|
|
||||||
nSelectors = bsR(15);
|
|
||||||
for (i = 0; i < nSelectors; i++) {
|
|
||||||
j = 0;
|
|
||||||
while (bsR(1) == 1) {
|
|
||||||
j++;
|
|
||||||
}
|
|
||||||
selectorMtf[i] = (char) j;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Undo the MTF values for the selectors. */
|
|
||||||
{
|
|
||||||
char[] pos = new char[N_GROUPS];
|
|
||||||
char tmp, v;
|
|
||||||
for (v = 0; v < nGroups; v++) {
|
|
||||||
pos[v] = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < nSelectors; i++) {
|
|
||||||
v = selectorMtf[i];
|
|
||||||
tmp = pos[v];
|
|
||||||
while (v > 0) {
|
|
||||||
pos[v] = pos[v - 1];
|
|
||||||
v--;
|
|
||||||
}
|
|
||||||
pos[0] = tmp;
|
|
||||||
selector[i] = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now the coding tables */
|
|
||||||
for (t = 0; t < nGroups; t++) {
|
|
||||||
int curr = bsR(5);
|
|
||||||
for (i = 0; i < alphaSize; i++) {
|
|
||||||
while (bsR(1) == 1) {
|
|
||||||
if (bsR(1) == 0) {
|
|
||||||
curr++;
|
|
||||||
} else {
|
|
||||||
curr--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
len[t][i] = (char) curr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Create the Huffman decoding tables */
|
|
||||||
for (t = 0; t < nGroups; t++) {
|
|
||||||
minLen = 32;
|
|
||||||
maxLen = 0;
|
|
||||||
for (i = 0; i < alphaSize; i++) {
|
|
||||||
if (len[t][i] > maxLen) {
|
|
||||||
maxLen = len[t][i];
|
|
||||||
}
|
|
||||||
if (len[t][i] < minLen) {
|
|
||||||
minLen = len[t][i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen,
|
|
||||||
maxLen, alphaSize);
|
|
||||||
minLens[t] = minLen;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void getAndMoveToFrontDecode() {
|
|
||||||
char[] yy = new char[256];
|
|
||||||
int i, j, nextSym, limitLast;
|
|
||||||
int EOB, groupNo, groupPos;
|
|
||||||
|
|
||||||
limitLast = baseBlockSize * blockSize100k;
|
|
||||||
origPtr = bsGetIntVS(24);
|
|
||||||
|
|
||||||
recvDecodingTables();
|
|
||||||
EOB = nInUse + 1;
|
|
||||||
groupNo = -1;
|
|
||||||
groupPos = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
Setting up the unzftab entries here is not strictly
|
|
||||||
necessary, but it does save having to do it later
|
|
||||||
in a separate pass, and so saves a block's worth of
|
|
||||||
cache misses.
|
|
||||||
*/
|
|
||||||
for (i = 0; i <= 255; i++) {
|
|
||||||
unzftab[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i <= 255; i++) {
|
|
||||||
yy[i] = (char) i;
|
|
||||||
}
|
|
||||||
|
|
||||||
last = -1;
|
|
||||||
|
|
||||||
{
|
|
||||||
int zt, zn, zvec, zj;
|
|
||||||
if (groupPos == 0) {
|
|
||||||
groupNo++;
|
|
||||||
groupPos = G_SIZE;
|
|
||||||
}
|
|
||||||
groupPos--;
|
|
||||||
zt = selector[groupNo];
|
|
||||||
zn = minLens[zt];
|
|
||||||
zvec = bsR(zn);
|
|
||||||
while (zvec > limit[zt][zn]) {
|
|
||||||
zn++;
|
|
||||||
{
|
|
||||||
{
|
|
||||||
while (bsLive < 1) {
|
|
||||||
int zzi;
|
|
||||||
char thech = 0;
|
|
||||||
try {
|
|
||||||
thech = (char) bsStream.read();
|
|
||||||
} catch (IOException e) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
if (thech == -1) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
zzi = thech;
|
|
||||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
|
||||||
bsLive += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
|
||||||
bsLive--;
|
|
||||||
}
|
|
||||||
zvec = (zvec << 1) | zj;
|
|
||||||
}
|
|
||||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
|
||||||
}
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
|
|
||||||
if (nextSym == EOB) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nextSym == RUNA || nextSym == RUNB) {
|
|
||||||
char ch;
|
|
||||||
int s = -1;
|
|
||||||
int N = 1;
|
|
||||||
do {
|
|
||||||
if (nextSym == RUNA) {
|
|
||||||
s = s + (0 + 1) * N;
|
|
||||||
} else if (nextSym == RUNB) {
|
|
||||||
s = s + (1 + 1) * N;
|
|
||||||
}
|
|
||||||
N = N * 2;
|
|
||||||
{
|
|
||||||
int zt, zn, zvec, zj;
|
|
||||||
if (groupPos == 0) {
|
|
||||||
groupNo++;
|
|
||||||
groupPos = G_SIZE;
|
|
||||||
}
|
|
||||||
groupPos--;
|
|
||||||
zt = selector[groupNo];
|
|
||||||
zn = minLens[zt];
|
|
||||||
zvec = bsR(zn);
|
|
||||||
while (zvec > limit[zt][zn]) {
|
|
||||||
zn++;
|
|
||||||
{
|
|
||||||
{
|
|
||||||
while (bsLive < 1) {
|
|
||||||
int zzi;
|
|
||||||
char thech = 0;
|
|
||||||
try {
|
|
||||||
thech = (char) bsStream.read();
|
|
||||||
} catch (IOException e) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
if (thech == -1) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
zzi = thech;
|
|
||||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
|
||||||
bsLive += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
|
||||||
bsLive--;
|
|
||||||
}
|
|
||||||
zvec = (zvec << 1) | zj;
|
|
||||||
}
|
|
||||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
|
||||||
}
|
|
||||||
} while (nextSym == RUNA || nextSym == RUNB);
|
|
||||||
|
|
||||||
s++;
|
|
||||||
ch = seqToUnseq[yy[0]];
|
|
||||||
unzftab[ch] += s;
|
|
||||||
|
|
||||||
while (s > 0) {
|
|
||||||
last++;
|
|
||||||
ll8[last] = ch;
|
|
||||||
s--;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (last >= limitLast) {
|
|
||||||
blockOverrun();
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
char tmp;
|
|
||||||
last++;
|
|
||||||
if (last >= limitLast) {
|
|
||||||
blockOverrun();
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp = yy[nextSym - 1];
|
|
||||||
unzftab[seqToUnseq[tmp]]++;
|
|
||||||
ll8[last] = seqToUnseq[tmp];
|
|
||||||
|
|
||||||
/*
|
|
||||||
This loop is hammered during decompression,
|
|
||||||
hence the unrolling.
|
|
||||||
|
|
||||||
for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1];
|
|
||||||
*/
|
|
||||||
|
|
||||||
j = nextSym - 1;
|
|
||||||
for (; j > 3; j -= 4) {
|
|
||||||
yy[j] = yy[j - 1];
|
|
||||||
yy[j - 1] = yy[j - 2];
|
|
||||||
yy[j - 2] = yy[j - 3];
|
|
||||||
yy[j - 3] = yy[j - 4];
|
|
||||||
}
|
|
||||||
for (; j > 0; j--) {
|
|
||||||
yy[j] = yy[j - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
yy[0] = tmp;
|
|
||||||
{
|
|
||||||
int zt, zn, zvec, zj;
|
|
||||||
if (groupPos == 0) {
|
|
||||||
groupNo++;
|
|
||||||
groupPos = G_SIZE;
|
|
||||||
}
|
|
||||||
groupPos--;
|
|
||||||
zt = selector[groupNo];
|
|
||||||
zn = minLens[zt];
|
|
||||||
zvec = bsR(zn);
|
|
||||||
while (zvec > limit[zt][zn]) {
|
|
||||||
zn++;
|
|
||||||
{
|
|
||||||
{
|
|
||||||
while (bsLive < 1) {
|
|
||||||
int zzi;
|
|
||||||
char thech = 0;
|
|
||||||
try {
|
|
||||||
thech = (char) bsStream.read();
|
|
||||||
} catch (IOException e) {
|
|
||||||
compressedStreamEOF();
|
|
||||||
}
|
|
||||||
zzi = thech;
|
|
||||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
|
||||||
bsLive += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
|
||||||
bsLive--;
|
|
||||||
}
|
|
||||||
zvec = (zvec << 1) | zj;
|
|
||||||
}
|
|
||||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupBlock() {
|
|
||||||
int[] cftab = new int[257];
|
|
||||||
char ch;
|
|
||||||
|
|
||||||
cftab[0] = 0;
|
|
||||||
for (i = 1; i <= 256; i++) {
|
|
||||||
cftab[i] = unzftab[i - 1];
|
|
||||||
}
|
|
||||||
for (i = 1; i <= 256; i++) {
|
|
||||||
cftab[i] += cftab[i - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i <= last; i++) {
|
|
||||||
ch = (char) ll8[i];
|
|
||||||
tt[cftab[ch]] = i;
|
|
||||||
cftab[ch]++;
|
|
||||||
}
|
|
||||||
cftab = null;
|
|
||||||
|
|
||||||
tPos = tt[origPtr];
|
|
||||||
|
|
||||||
count = 0;
|
|
||||||
i2 = 0;
|
|
||||||
ch2 = 256; /* not a char and not EOF */
|
|
||||||
|
|
||||||
if (blockRandomised) {
|
|
||||||
rNToGo = 0;
|
|
||||||
rTPos = 0;
|
|
||||||
setupRandPartA();
|
|
||||||
} else {
|
|
||||||
setupNoRandPartA();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupRandPartA() {
|
|
||||||
if (i2 <= last) {
|
|
||||||
chPrev = ch2;
|
|
||||||
ch2 = ll8[tPos];
|
|
||||||
tPos = tt[tPos];
|
|
||||||
if (rNToGo == 0) {
|
|
||||||
rNToGo = rNums[rTPos];
|
|
||||||
rTPos++;
|
|
||||||
if (rTPos == 512) {
|
|
||||||
rTPos = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rNToGo--;
|
|
||||||
ch2 ^= (int) ((rNToGo == 1) ? 1 : 0);
|
|
||||||
i2++;
|
|
||||||
|
|
||||||
currentChar = ch2;
|
|
||||||
currentState = RAND_PART_B_STATE;
|
|
||||||
mCrc.updateCRC(ch2);
|
|
||||||
} else {
|
|
||||||
endBlock();
|
|
||||||
initBlock();
|
|
||||||
setupBlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupNoRandPartA() {
|
|
||||||
if (i2 <= last) {
|
|
||||||
chPrev = ch2;
|
|
||||||
ch2 = ll8[tPos];
|
|
||||||
tPos = tt[tPos];
|
|
||||||
i2++;
|
|
||||||
|
|
||||||
currentChar = ch2;
|
|
||||||
currentState = NO_RAND_PART_B_STATE;
|
|
||||||
mCrc.updateCRC(ch2);
|
|
||||||
} else {
|
|
||||||
endBlock();
|
|
||||||
initBlock();
|
|
||||||
setupBlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupRandPartB() {
|
|
||||||
if (ch2 != chPrev) {
|
|
||||||
currentState = RAND_PART_A_STATE;
|
|
||||||
count = 1;
|
|
||||||
setupRandPartA();
|
|
||||||
} else {
|
|
||||||
count++;
|
|
||||||
if (count >= 4) {
|
|
||||||
z = ll8[tPos];
|
|
||||||
tPos = tt[tPos];
|
|
||||||
if (rNToGo == 0) {
|
|
||||||
rNToGo = rNums[rTPos];
|
|
||||||
rTPos++;
|
|
||||||
if (rTPos == 512) {
|
|
||||||
rTPos = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rNToGo--;
|
|
||||||
z ^= ((rNToGo == 1) ? 1 : 0);
|
|
||||||
j2 = 0;
|
|
||||||
currentState = RAND_PART_C_STATE;
|
|
||||||
setupRandPartC();
|
|
||||||
} else {
|
|
||||||
currentState = RAND_PART_A_STATE;
|
|
||||||
setupRandPartA();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupRandPartC() {
|
|
||||||
if (j2 < (int) z) {
|
|
||||||
currentChar = ch2;
|
|
||||||
mCrc.updateCRC(ch2);
|
|
||||||
j2++;
|
|
||||||
} else {
|
|
||||||
currentState = RAND_PART_A_STATE;
|
|
||||||
i2++;
|
|
||||||
count = 0;
|
|
||||||
setupRandPartA();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupNoRandPartB() {
|
|
||||||
if (ch2 != chPrev) {
|
|
||||||
currentState = NO_RAND_PART_A_STATE;
|
|
||||||
count = 1;
|
|
||||||
setupNoRandPartA();
|
|
||||||
} else {
|
|
||||||
count++;
|
|
||||||
if (count >= 4) {
|
|
||||||
z = ll8[tPos];
|
|
||||||
tPos = tt[tPos];
|
|
||||||
currentState = NO_RAND_PART_C_STATE;
|
|
||||||
j2 = 0;
|
|
||||||
setupNoRandPartC();
|
|
||||||
} else {
|
|
||||||
currentState = NO_RAND_PART_A_STATE;
|
|
||||||
setupNoRandPartA();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setupNoRandPartC() {
|
|
||||||
if (j2 < (int) z) {
|
|
||||||
currentChar = ch2;
|
|
||||||
mCrc.updateCRC(ch2);
|
|
||||||
j2++;
|
|
||||||
} else {
|
|
||||||
currentState = NO_RAND_PART_A_STATE;
|
|
||||||
i2++;
|
|
||||||
count = 0;
|
|
||||||
setupNoRandPartA();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setDecompressStructureSizes(int newSize100k) {
|
|
||||||
if (!(0 <= newSize100k && newSize100k <= 9 && 0 <= blockSize100k
|
|
||||||
&& blockSize100k <= 9)) {
|
|
||||||
// throw new IOException("Invalid block size");
|
|
||||||
}
|
|
||||||
|
|
||||||
blockSize100k = newSize100k;
|
|
||||||
|
|
||||||
if (newSize100k == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int n = baseBlockSize * newSize100k;
|
|
||||||
ll8 = new char[n];
|
|
||||||
tt = new int[n];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
@ -1,167 +0,0 @@
|
|||||||
/*
|
|
||||||
* The Apache Software License, Version 1.1
|
|
||||||
*
|
|
||||||
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
|
|
||||||
* reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in
|
|
||||||
* the documentation and/or other materials provided with the
|
|
||||||
* distribution.
|
|
||||||
*
|
|
||||||
* 3. The end-user documentation included with the redistribution, if
|
|
||||||
* any, must include the following acknowlegement:
|
|
||||||
* "This product includes software developed by the
|
|
||||||
* Apache Software Foundation (http://www.apache.org/)."
|
|
||||||
* Alternately, this acknowlegement may appear in the software itself,
|
|
||||||
* if and wherever such third-party acknowlegements normally appear.
|
|
||||||
*
|
|
||||||
* 4. The names "Ant" and "Apache Software
|
|
||||||
* Foundation" must not be used to endorse or promote products derived
|
|
||||||
* from this software without prior written permission. For written
|
|
||||||
* permission, please contact apache@apache.org.
|
|
||||||
*
|
|
||||||
* 5. Products derived from this software may not be called "Apache"
|
|
||||||
* nor may "Apache" appear in their names without prior written
|
|
||||||
* permission of the Apache Group.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
|
||||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
||||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
|
||||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
|
||||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
||||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
* ====================================================================
|
|
||||||
*
|
|
||||||
* This software consists of voluntary contributions made by many
|
|
||||||
* individuals on behalf of the Apache Software Foundation. For more
|
|
||||||
* information on the Apache Software Foundation, please see
|
|
||||||
* <http://www.apache.org/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
|
||||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
|
||||||
* great code.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.tools.bzip2;
|
|
||||||
|
|
||||||
/**
 * A simple class to hold and calculate the CRC for sanity checking
 * of the data.
 *
 * <p>The checksum is computed one byte at a time, most significant bit
 * first, through a 256-entry lookup table. The running register starts at
 * {@code 0xffffffff} and the final value is its bitwise complement.
 *
 * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
 */
class CRC {
    /**
     * Lookup table indexed by the top byte of the running register XORed
     * with the incoming byte. The reference is final so that the table
     * cannot be swapped out from under running computations.
     */
    public static final int[] crc32Table = {
        0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
        0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
        0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
        0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
        0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
        0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
        0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
        0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
        0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
        0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
        0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
        0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
        0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
        0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
        0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
        0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
        0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
        0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
        0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
        0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
        0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
        0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
        0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
        0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
        0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
        0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
        0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
        0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
        0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
        0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
        0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
        0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
        0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
        0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
        0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
        0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
        0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
        0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
        0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
        0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
        0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
        0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
        0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
        0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
        0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
        0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
        0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
        0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
        0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
        0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
        0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
        0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
        0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
        0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
        0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
        0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
        0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
        0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
        0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
        0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
        0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
        0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
        0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
        0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
    };

    /** Running CRC register; holds the uncomplemented intermediate value. */
    int globalCrc;

    /** Creates a checksum with the register in its initial state. */
    public CRC() {
        initialiseCRC();
    }

    /** Resets the running register to its start value of all one bits. */
    void initialiseCRC() {
        globalCrc = 0xffffffff;
    }

    /**
     * Returns the finished checksum.
     *
     * @return the bitwise complement of the running register
     */
    int getFinalCRC() {
        return ~globalCrc;
    }

    /**
     * Returns the raw running register without the final complement.
     *
     * @return the current register value
     */
    int getGlobalCRC() {
        return globalCrc;
    }

    /**
     * Overwrites the running register.
     *
     * @param newCrc value to continue the computation from
     */
    void setGlobalCRC(int newCrc) {
        globalCrc = newCrc;
    }

    /**
     * Folds one byte into the running register.
     *
     * @param inCh the byte to add; only its low eight bits are used
     */
    void updateCRC(int inCh) {
        // Unsigned shift plus mask always yields a table index in 0..255.
        // For inputs in the byte range this selects the same entry as the
        // signed shift followed by a "256 + temp" fix-up, but it no longer
        // runs off the table when inCh carries bits above the low byte.
        int index = ((globalCrc >>> 24) ^ inCh) & 0xff;
        globalCrc = (globalCrc << 8) ^ crc32Table[index];
    }
}
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user