- moved all code that contained MIT IP outside (http://code.google.com/p/simile-vicino/)
- moved bzip2 and tar code from apache ant into their own jar files - now gridworks source contains only com.metaweb.* code everything else is a jar dependency - started to work on archive importer git-svn-id: http://google-refine.googlecode.com/svn/trunk@376 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
4eda7ae2c0
commit
72203cd3d5
@ -15,11 +15,12 @@
|
||||
<classpathentry kind="lib" path="lib/arithcode-1.1.jar" sourcepath="lib-src/arithcode-1.1-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jdatapath-alpha2.jar" sourcepath="lib-src/jdatapath-alpha2-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/secondstring-20100303.jar" sourcepath="lib-src/secondstring-20100303-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/ant-tools-1.8.0.jar" sourcepath="lib-src/ant-tools-1.8.0-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/vicino-1.1.jar" sourcepath="lib-src/vicino-1.1-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
|
||||
<classpathentry kind="lib" path="lib/apache-tools-tar.jar"/>
|
||||
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
|
||||
<classpathentry kind="lib" path="lib/jython-2.5.1.jar"/>
|
||||
<classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/>
|
||||
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
|
||||
<classpathentry kind="output" path="build/classes"/>
|
||||
</classpath>
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2010, Metaweb Technologies, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Metaweb Technologies, Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
@ -36,9 +36,9 @@ See the 'licenses' directory for a list of the licenses for the libraries we dep
|
||||
ordered here by license:
|
||||
|
||||
licenses/apache2.0.LICENSE.txt
|
||||
ant (package org.apache.tools.tar)
|
||||
bzip2 (package org.apache.tools.bzip2)
|
||||
calendar-parser (package com.metaweb.gridworks.expr.util)
|
||||
ant-tools
|
||||
bzip2
|
||||
commons-lang
|
||||
commons-codec
|
||||
jdatapath
|
||||
@ -58,7 +58,7 @@ licenses/dom4j.LICENSE.txt (BSD family)
|
||||
dom4j
|
||||
|
||||
licenses/simile.LICENSE.txt (BSD family)
|
||||
vicino (package edu.mit.simile.vicino)
|
||||
vicino
|
||||
|
||||
licenses/arithcode.LICENSE.txt (BSD family)
|
||||
arithcode
|
||||
|
BIN
lib-src/ant-tools-1.8.0-sources.jar
Normal file
BIN
lib-src/ant-tools-1.8.0-sources.jar
Normal file
Binary file not shown.
BIN
lib-src/vicino-1.1-sources.jar
Normal file
BIN
lib-src/vicino-1.1-sources.jar
Normal file
Binary file not shown.
BIN
lib/ant-tools-1.8.0.jar
Normal file
BIN
lib/ant-tools-1.8.0.jar
Normal file
Binary file not shown.
Binary file not shown.
BIN
lib/vicino-1.1.jar
Normal file
BIN
lib/vicino-1.1.jar
Normal file
Binary file not shown.
@ -37,6 +37,7 @@ import com.metaweb.util.threads.ThreadPoolExecutorAdapter;
|
||||
public class Gridworks {
|
||||
|
||||
static private String version;
|
||||
static private File tempDir;
|
||||
|
||||
private static Logger root = Logger.getRootLogger();
|
||||
private static Logger logger = Logger.getLogger("com.metaweb.gridworks");
|
||||
@ -65,6 +66,10 @@ public class Gridworks {
|
||||
return version;
|
||||
}
|
||||
|
||||
public static File getTempFile(String name) {
|
||||
return new File(tempDir, name);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
// tell jetty to use SLF4J for logging instead of its own stuff
|
||||
@ -85,6 +90,9 @@ public class Gridworks {
|
||||
|
||||
version = Configurations.get("gridworks.version","trunk");
|
||||
|
||||
tempDir = new File(Configurations.get("gridworks.temp","temp"));
|
||||
if (!tempDir.exists()) tempDir.mkdirs();
|
||||
|
||||
Gridworks gridworks = new Gridworks();
|
||||
|
||||
gridworks.init(args);
|
||||
|
@ -1,6 +1,9 @@
|
||||
package com.metaweb.gridworks.commands.edit;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
@ -9,12 +12,24 @@ import java.io.StringReader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import org.apache.tools.bzip2.CBZip2InputStream;
|
||||
import org.apache.tools.tar.TarEntry;
|
||||
import org.apache.tools.tar.TarInputStream;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import com.metaweb.gridworks.Gridworks;
|
||||
@ -91,7 +106,9 @@ public class CreateProjectCommand extends Command {
|
||||
while ((part = parser.readNextPart()) != null) {
|
||||
|
||||
if (part.isFile()) {
|
||||
internalImportFilePart((FilePart) part, project, options);
|
||||
|
||||
FilePart filePart = (FilePart) part;
|
||||
internalImportFile(project, options, filePart.getFileName(), filePart.getInputStream());
|
||||
|
||||
} else if (part.isParam()) {
|
||||
ParamPart paramPart = (ParamPart) part;
|
||||
@ -118,15 +135,120 @@ public class CreateProjectCommand extends Command {
|
||||
}
|
||||
}
|
||||
|
||||
protected void internalImportFilePart(
|
||||
FilePart filePart,
|
||||
protected void internalImportFile(
|
||||
Project project,
|
||||
Properties options
|
||||
Properties options,
|
||||
String fileName,
|
||||
InputStream inputStream
|
||||
) throws Exception {
|
||||
|
||||
Importer importer = guessImporter(options, null, filePart.getFileName());
|
||||
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tar.bz2")) {
|
||||
// first, save the file on disk, since we need two passes and we might
|
||||
// not have enough memory to keep it all in there
|
||||
File file = save(inputStream);
|
||||
|
||||
internalInvokeImporter(project, importer, options, filePart.getInputStream(), null);
|
||||
// in the first pass, gather statistics about what files are in there
|
||||
// unfortunately, we have to rely on files extensions, which is horrible but
|
||||
// better than nothing
|
||||
BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
|
||||
InputStream is = (fileName.endsWith(".tar.gz")) ? new GZIPInputStream(stream): new CBZip2InputStream(stream);
|
||||
TarInputStream tis = new TarInputStream(is);
|
||||
HashMap<String,Integer> ext_map = new HashMap<String,Integer>();
|
||||
while (true) {
|
||||
TarEntry entry = tis.getNextEntry();
|
||||
if (entry == null) break;
|
||||
if (!entry.isDirectory()) {
|
||||
String name = entry.getName();
|
||||
String ext = getExtension(name)[1];
|
||||
if (ext_map.containsKey(ext)) {
|
||||
ext_map.put(ext, ext_map.get(ext) + 1);
|
||||
} else {
|
||||
ext_map.put(ext, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
stream.close();
|
||||
|
||||
// sort extensions by how often they appear
|
||||
List<Entry<String,Integer>> values = new ArrayList<Entry<String,Integer>>(ext_map.entrySet());
|
||||
Collections.sort(values, new ValuesComparator());
|
||||
|
||||
if (values.size() == 0) {
|
||||
throw new RuntimeException("The archive contains no files.");
|
||||
}
|
||||
|
||||
// this will contain the set of extensions we'll load from the archive
|
||||
HashSet<String> exts = new HashSet<String>();
|
||||
|
||||
// find the extension that is most frequent or those who share the highest frequency value
|
||||
Entry<String,Integer> most_frequent = values.get(0);
|
||||
Entry<String,Integer> second_most_frequent = values.get(1);
|
||||
if (most_frequent.getValue() > second_most_frequent.getValue()) { // we have a winner
|
||||
exts.add(most_frequent.getKey());
|
||||
} else { // multiple extensions have the same frequency
|
||||
int winning_frequency = most_frequent.getValue();
|
||||
for (Entry<String,Integer> e : values) {
|
||||
if (e.getValue() == winning_frequency) {
|
||||
exts.add(e.getKey());
|
||||
}
|
||||
}
|
||||
}
|
||||
Gridworks.log("Most frequent extensions: " + exts.toString());
|
||||
|
||||
|
||||
} else if (fileName.endsWith(".zip")) {
|
||||
|
||||
} else if (fileName.endsWith(".gz")) {
|
||||
String[] frags = getExtension(fileName);
|
||||
internalImportFile(project, options, frags[0], new GZIPInputStream(inputStream));
|
||||
} else if (fileName.endsWith(".bz2")) {
|
||||
String[] frags = getExtension(fileName);
|
||||
internalImportFile(project, options, frags[0], new CBZip2InputStream(inputStream));
|
||||
} else {
|
||||
load(project, options, fileName, inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Orders map entries by their integer value, largest value first.
 * Only the sign of the result is meaningful.
 */
public class ValuesComparator implements Comparator<Entry<String,Integer>> {
    public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
        // compareTo rather than subtraction: the difference of two ints can
        // overflow and flip the sign of the result
        return o2.getValue().compareTo(o1.getValue());
    }
}
|
||||
|
||||
private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception {
|
||||
Importer importer = guessImporter(options, null, fileName);
|
||||
internalInvokeImporter(project, importer, options, inputStream, null);
|
||||
inputStream.close();
|
||||
}
|
||||
|
||||
private File save(InputStream is) throws IOException {
|
||||
File temp = Gridworks.getTempFile(Long.toString(System.currentTimeMillis()));
|
||||
temp.deleteOnExit();
|
||||
copy(is,temp);
|
||||
is.close();
|
||||
return temp;
|
||||
}
|
||||
|
||||
private String[] getExtension(String filename) {
|
||||
String[] result = new String[2];
|
||||
int ext_index = filename.lastIndexOf(".");
|
||||
result[0] = (ext_index == -1) ? filename : filename.substring(0,ext_index);
|
||||
result[1] = (ext_index == -1) ? "" : filename.substring(ext_index + 1);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static long copy(InputStream input, File file) throws IOException {
|
||||
FileOutputStream output = new FileOutputStream(file);
|
||||
byte[] buffer = new byte[4 * 1024];
|
||||
long count = 0;
|
||||
int n = 0;
|
||||
while (-1 != (n = input.read(buffer))) {
|
||||
output.write(buffer, 0, n);
|
||||
count += n;
|
||||
}
|
||||
output.close();
|
||||
input.close();
|
||||
return count;
|
||||
}
|
||||
|
||||
protected void internalImportURL(
|
||||
@ -237,17 +359,9 @@ public class CreateProjectCommand extends Command {
|
||||
new InputStreamReader(inputStream);
|
||||
}
|
||||
|
||||
try {
|
||||
importer.read(reader, project, options, skip, limit);
|
||||
} finally {
|
||||
reader.close();
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
importer.read(inputStream, project, options, skip, limit);
|
||||
} finally {
|
||||
inputStream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -295,6 +409,12 @@ public class CreateProjectCommand extends Command {
|
||||
return new ExcelImporter(false);
|
||||
} else if("application/x-xls".equals(contentType)) {
|
||||
return new ExcelImporter(true);
|
||||
} else if("application/xml".equals(contentType) ||
|
||||
"text/xml".equals(contentType) ||
|
||||
"application/rss+xml".equals(contentType) ||
|
||||
"application/atom+xml".equals(contentType) ||
|
||||
"application/rdf+xml".equals(contentType)) {
|
||||
return new XmlImporter();
|
||||
}
|
||||
} else if (fileName != null) {
|
||||
fileName = fileName.toLowerCase();
|
||||
|
@ -1,79 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.clustering.Clusterer;
|
||||
import edu.mit.simile.vicino.clustering.NGramClusterer;
|
||||
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Cluster extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
(new Cluster()).init(args);
|
||||
}
|
||||
|
||||
public void init(String[] args) throws Exception {
|
||||
Distance distance = getDistance(args[0]);
|
||||
List<String> strings = getStrings(args[1]);
|
||||
double radius = Double.parseDouble(args[2]);
|
||||
int blocking_size = Integer.parseInt(args[3]);
|
||||
|
||||
long vptree_start = System.currentTimeMillis();
|
||||
Clusterer vptree_clusterer = new VPTreeClusterer(distance);
|
||||
for (String s: strings) {
|
||||
vptree_clusterer.populate(s);
|
||||
}
|
||||
List<Set<Serializable>> vptree_clusters = vptree_clusterer.getClusters(radius);
|
||||
long vptree_elapsed = System.currentTimeMillis() - vptree_start;
|
||||
int vptree_distances = distance.getCount();
|
||||
distance.resetCounter();
|
||||
|
||||
long ngram_start = System.currentTimeMillis();
|
||||
Clusterer ngram_clusterer = new NGramClusterer(distance,blocking_size);
|
||||
for (String s: strings) {
|
||||
ngram_clusterer.populate(s);
|
||||
}
|
||||
List<Set<Serializable>> ngram_clusters = ngram_clusterer.getClusters(radius);
|
||||
long ngram_elapsed = System.currentTimeMillis() - ngram_start;
|
||||
int ngram_distances = distance.getCount();
|
||||
distance.resetCounter();
|
||||
|
||||
log("VPTree found " + vptree_clusters.size() + " in " + vptree_elapsed + " ms with " + vptree_distances + " distances\n");
|
||||
log("NGram found " + ngram_clusters.size() + " in " + ngram_elapsed + " ms with " + ngram_distances + " distances\n");
|
||||
|
||||
if (vptree_clusters.size() > ngram_clusters.size()) {
|
||||
log("VPTree clusterer found these clusters the other method couldn't: ");
|
||||
diff(vptree_clusters,ngram_clusters);
|
||||
} else if (ngram_clusters.size() > vptree_clusters.size()) {
|
||||
log("NGram clusterer found these clusters the other method couldn't: ");
|
||||
diff(ngram_clusters,vptree_clusters);
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
private void diff(List<Set<Serializable>> more, List<Set<Serializable>> base) {
|
||||
Set<Set<Serializable>> holder = new HashSet<Set<Serializable>>(base.size());
|
||||
|
||||
for (Set<Serializable> s : base) {
|
||||
holder.add(s);
|
||||
}
|
||||
|
||||
for (Set<Serializable> s : more) {
|
||||
if (!holder.contains(s)) {
|
||||
printCluster(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printCluster(Set<Serializable> cluster) {
|
||||
for (Serializable s : cluster) {
|
||||
log(s.toString());
|
||||
}
|
||||
log("");
|
||||
}
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Distributor extends Operator {
|
||||
|
||||
private static final int COLUMNS = 70;
|
||||
private static final char CHAR = '=';
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
Distance d = getDistance(args[0]);
|
||||
|
||||
List<String> strings = getStrings(args[1]);
|
||||
|
||||
int buckets = Integer.parseInt(args[2]);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
int[] values = new int[buckets];
|
||||
|
||||
int size = strings.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
String x = (String) strings.get((int) (Math.random() * size));
|
||||
String y = (String) strings.get((int) (Math.random() * size));
|
||||
int dist = (int) (buckets * d.d(x, y));
|
||||
values[dist]++;
|
||||
System.out.print(".");
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
long stop = System.currentTimeMillis();
|
||||
float m = ((float) (stop - start)) / (float) size;
|
||||
|
||||
int maxValue = 0;
|
||||
for (int i = 0; i < buckets; i++) {
|
||||
if (values[i] > maxValue) {
|
||||
maxValue = values[i];
|
||||
}
|
||||
}
|
||||
|
||||
System.out
|
||||
.println("+-------------------------------------------------------------------");
|
||||
for (int i = 0; i < buckets; i++) {
|
||||
System.out.println("|" + bar(COLUMNS * values[i] / maxValue));
|
||||
}
|
||||
System.out
|
||||
.println("+-------------------------------------------------------------------");
|
||||
|
||||
System.out.println("\n Each distance calculation took: " + m + " millis");
|
||||
}
|
||||
|
||||
static private String bar(int value) {
|
||||
StringBuffer b = new StringBuffer(value);
|
||||
for (int i = 0; i < value; i++) {
|
||||
b.append(CHAR);
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
}
|
@ -1,12 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Meter extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
Distance d = getDistance(args[0]);
|
||||
System.out.println(args[1] + " <- " + d.d(args[1], args[2]) + " -> " + args[2]);
|
||||
}
|
||||
|
||||
}
|
@ -1,94 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.wcohen.ss.api.Token;
|
||||
import com.wcohen.ss.api.Tokenizer;
|
||||
|
||||
public class NGramTokenizer implements Tokenizer {
|
||||
|
||||
private int ngram_size;
|
||||
|
||||
public NGramTokenizer(int ngram_size) {
|
||||
this.ngram_size = ngram_size;
|
||||
}
|
||||
|
||||
public Token[] tokenize(String str) {
|
||||
str = normalize(str);
|
||||
List<Token> tokens = new ArrayList<Token>();
|
||||
for (int i = 0; i < str.length(); i++) {
|
||||
int index = i + ngram_size;
|
||||
if (index <= str.length()) {
|
||||
tokens.add(intern(str.substring(i,index)));
|
||||
}
|
||||
}
|
||||
return (Token[]) tokens.toArray(new BasicToken[tokens.size()]);
|
||||
}
|
||||
|
||||
static final Pattern extra = Pattern.compile("\\p{Cntrl}|\\p{Punct}");
|
||||
static final Pattern whitespace = Pattern.compile("\\p{Space}+");
|
||||
|
||||
private String normalize(String s) {
|
||||
s = s.trim();
|
||||
s = extra.matcher(s).replaceAll("");
|
||||
s = whitespace.matcher(s).replaceAll(" ");
|
||||
s = s.toLowerCase();
|
||||
return s.intern();
|
||||
}
|
||||
|
||||
private int nextId = 0;
|
||||
private Map<String, Token> tokMap = new TreeMap<String, Token>();
|
||||
|
||||
public Token intern(String s) {
|
||||
s = s.toLowerCase().intern();
|
||||
Token tok = tokMap.get(s);
|
||||
if (tok == null) {
|
||||
tok = new BasicToken(++nextId, s);
|
||||
tokMap.put(s, tok);
|
||||
}
|
||||
return tok;
|
||||
}
|
||||
|
||||
public Iterator<Token> tokenIterator() {
|
||||
return tokMap.values().iterator();
|
||||
}
|
||||
|
||||
public int maxTokenIndex() {
|
||||
return nextId;
|
||||
}
|
||||
|
||||
public class BasicToken implements Token, Comparable<Token> {
|
||||
private final int index;
|
||||
private final String value;
|
||||
|
||||
BasicToken(int index, String value) {
|
||||
this.index = index;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public int getIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
public int compareTo(Token t) {
|
||||
return index - t.getIndex();
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return value.hashCode();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "[token#" + getIndex() + ":" + getValue() + "]";
|
||||
}
|
||||
}
|
||||
}
|
@ -1,47 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Operator {
|
||||
|
||||
static void log(String msg) {
|
||||
System.out.println(msg);
|
||||
}
|
||||
|
||||
static Distance getDistance(String distance) throws Exception {
|
||||
return (Distance) Class.forName("edu.mit.simile.vicino.distances." + distance + "Distance").newInstance();
|
||||
}
|
||||
|
||||
static List<String> getStrings(String fileName) throws IOException {
|
||||
List<String> strings = new ArrayList<String>();
|
||||
|
||||
File file = new File(fileName);
|
||||
if (file.isDirectory()) {
|
||||
File[] files = file.listFiles();
|
||||
for (File f : files) {
|
||||
getStrings(f, strings);
|
||||
}
|
||||
} else {
|
||||
getStrings(file, strings);
|
||||
}
|
||||
|
||||
return strings;
|
||||
}
|
||||
|
||||
static void getStrings(File file, List<String> strings) throws IOException {
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
|
||||
String line;
|
||||
while ((line = input.readLine()) != null) {
|
||||
strings.add(line.trim().intern());
|
||||
}
|
||||
input.close();
|
||||
}
|
||||
}
|
@ -1,53 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.vptree.VPTree;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
|
||||
|
||||
public class Seeker extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
Distance d = getDistance(args[0]);
|
||||
|
||||
log("Working with distance: " + d);
|
||||
List<String> strings = getStrings(args[1]);
|
||||
log("Obtained " + strings.size() + " from " + args[1]);
|
||||
|
||||
log("Building VPTree...");
|
||||
VPTreeBuilder builder = new VPTreeBuilder(d);
|
||||
VPTree tree = builder.buildVPTree(strings);
|
||||
log("..done");
|
||||
|
||||
VPTreeSeeker seeker = new VPTreeSeeker(d, tree);
|
||||
|
||||
log("type a string|range then hit return:");
|
||||
BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
|
||||
String line = null;
|
||||
while ((line = input.readLine()) != null) {
|
||||
int index = line.indexOf('|');
|
||||
String query = line.substring(0, index);
|
||||
float range = Float.parseFloat(line.substring(index + 1));
|
||||
long start = System.currentTimeMillis();
|
||||
Set<Serializable> results = seeker.range(query, range);
|
||||
long stop = System.currentTimeMillis();
|
||||
Iterator<Serializable> j = results.iterator();
|
||||
if (j.hasNext()) {
|
||||
while (j.hasNext()) {
|
||||
String r = (String) j.next();
|
||||
log(" " + r);
|
||||
}
|
||||
log(" [done in " + (stop - start) + "ms]");
|
||||
} else {
|
||||
log(" [no results found in " + (stop - start) + "ms]");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
package edu.mit.simile.vicino;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class Tester extends Operator {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
Distance d = getDistance(args[0]);
|
||||
|
||||
List<String> strings = getStrings(args[1]);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
int size = strings.size();
|
||||
for (int i = 0; i < size * size * size; i++) {
|
||||
String x = (String) strings.get((int) (Math.random() * size));
|
||||
String y = (String) strings.get((int) (Math.random() * size));
|
||||
String z = (String) strings.get((int) (Math.random() * size));
|
||||
boolean metric = metric(x, y, z, d);
|
||||
if (metric) {
|
||||
System.out.println("metric");
|
||||
} else {
|
||||
System.out.println("***** NOT METRIC *****");
|
||||
}
|
||||
}
|
||||
|
||||
long stop = System.currentTimeMillis();
|
||||
float m = ((float) (stop - start)) / (float) size;
|
||||
|
||||
System.out.println("\n Each metric evaluation took: " + m + " millis");
|
||||
}
|
||||
|
||||
static boolean metric(String x, String y, String z, Distance d) {
|
||||
double dxx = d.d(x, x);
|
||||
boolean identity = (dxx == 0.0f);
|
||||
double dxy = d.d(x, y);
|
||||
double dyx = d.d(y, x);
|
||||
boolean simmetrical = (dxy == dyx);
|
||||
double dxz = d.d(x, z);
|
||||
double dyz = d.d(y, z);
|
||||
boolean triangular = (dxz <= dxy + dyz);
|
||||
return (identity && simmetrical && triangular);
|
||||
}
|
||||
}
|
@ -1,20 +0,0 @@
|
||||
package edu.mit.simile.vicino.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Contract shared by the clustering strategies: strings are fed one at a time
 * through populate(), then the clusters are requested for a given radius.
 */
public abstract class Clusterer {

    /**
     * Orders clusters from the largest to the smallest; the result is the
     * size of the second set minus the size of the first.
     */
    public class SizeComparator implements Comparator<Set<Serializable>> {
        public int compare(Set<Serializable> o1, Set<Serializable> o2) {
            int secondSize = o2.size();
            int firstSize = o1.size();
            return secondSize - firstSize;
        }
    }

    /** Adds one string to the data that will be clustered. */
    public abstract void populate(String s);

    /** Returns the clusters found among the populated strings for the given radius. */
    public abstract List<Set<Serializable>> getClusters(double radius);

}
|
@ -1,194 +0,0 @@
|
||||
package edu.mit.simile.vicino.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
import com.wcohen.ss.api.Token;
|
||||
|
||||
import edu.mit.simile.vicino.NGramTokenizer;
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
public class NGramClusterer extends Clusterer {
|
||||
|
||||
NGramTokenizer _tokenizer;
|
||||
Distance _distance;
|
||||
|
||||
Map<String,Set<String>> blocks = new HashMap<String,Set<String>>();
|
||||
|
||||
public NGramClusterer(Distance d, int blockSize) {
|
||||
_tokenizer = new NGramTokenizer(blockSize);
|
||||
_distance = d;
|
||||
}
|
||||
|
||||
public void populate(String s) {
|
||||
Token[] tokens = _tokenizer.tokenize(s);
|
||||
for (Token t : tokens) {
|
||||
String ss = t.getValue();
|
||||
Set<String> l = null;
|
||||
if (!blocks.containsKey(ss)) {
|
||||
l = new TreeSet<String>();
|
||||
blocks.put(ss, l);
|
||||
} else {
|
||||
l = blocks.get(ss);
|
||||
}
|
||||
l.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
public class BlockEvaluator implements Callable<Map<Serializable,Set<Serializable>>> {
|
||||
|
||||
int start;
|
||||
int stop;
|
||||
double radius;
|
||||
|
||||
List<Set<String>> blocks;
|
||||
Map<Serializable,Set<Serializable>> cluster_map;
|
||||
|
||||
public BlockEvaluator(List<Set<String>> blocks, double radius, int start, int stop) {
|
||||
this.blocks = blocks;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.radius = radius;
|
||||
}
|
||||
|
||||
public Map<Serializable,Set<Serializable>> call() {
|
||||
Map<Serializable,Set<Serializable>> cluster_map = new HashMap<Serializable,Set<Serializable>>();
|
||||
|
||||
for (int i = start; i < stop; i++) {
|
||||
Set<String> set = blocks.get(i);
|
||||
if (set.size() < 2) continue;
|
||||
for (String a : set) {
|
||||
for (String b : set) {
|
||||
if (a == b) continue;
|
||||
if (cluster_map.containsKey(a) && cluster_map.get(a).contains(b)) continue;
|
||||
if (cluster_map.containsKey(b) && cluster_map.get(b).contains(a)) continue;
|
||||
double d = _distance.d(a,b);
|
||||
if (d <= radius || radius < 0) {
|
||||
Set<Serializable> l = null;
|
||||
if (!cluster_map.containsKey(a)) {
|
||||
l = new TreeSet<Serializable>();
|
||||
l.add(a);
|
||||
cluster_map.put(a, l);
|
||||
} else {
|
||||
l = cluster_map.get(a);
|
||||
}
|
||||
l.add(b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return cluster_map;
|
||||
}
|
||||
}
|
||||
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
private static final boolean MULTITHREADED = true;
|
||||
|
||||
public List<Set<Serializable>> getClusters(double radius) {
|
||||
if (MULTITHREADED) {
|
||||
return getClustersMultiThread(radius);
|
||||
} else {
|
||||
return getClustersSingleThread(radius);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Set<Serializable>> getClustersMultiThread(double radius) {
|
||||
|
||||
int cores = Runtime.getRuntime().availableProcessors();
|
||||
int size = blocks.size();
|
||||
int range = size / cores + 1;
|
||||
|
||||
List<Map<Serializable,Set<Serializable>>> cluster_maps = new ArrayList<Map<Serializable,Set<Serializable>>>(cores);
|
||||
|
||||
List<BlockEvaluator> evaluators = new ArrayList<BlockEvaluator>(cores);
|
||||
for (int i = 0; i < cores; i++) {
|
||||
int range_start = range * i;
|
||||
int range_end = range * (i + 1);
|
||||
if (range_end > size) range_end = size;
|
||||
evaluators.add(new BlockEvaluator(new ArrayList<Set<String>>(blocks.values()),radius,range_start,range_end));
|
||||
}
|
||||
|
||||
try {
|
||||
List<Future<Map<Serializable,Set<Serializable>>>> futures = executor.invokeAll(evaluators);
|
||||
for (Future<Map<Serializable,Set<Serializable>>> future : futures) {
|
||||
cluster_maps.add(future.get());
|
||||
}
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
} catch (ExecutionException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
Set<Set<Serializable>> clusters = new HashSet<Set<Serializable>>();
|
||||
|
||||
for (Map<Serializable,Set<Serializable>> cluster_map : cluster_maps) {
|
||||
for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
|
||||
Set<Serializable> v = e.getValue();
|
||||
if (v.size() > 1) {
|
||||
clusters.add(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Set<Serializable>> sorted_clusters = new ArrayList<Set<Serializable>>(clusters);
|
||||
|
||||
Collections.sort(sorted_clusters, new SizeComparator());
|
||||
|
||||
return sorted_clusters;
|
||||
}
|
||||
|
||||
public List<Set<Serializable>> getClustersSingleThread(double radius) {
|
||||
|
||||
Map<Serializable,Set<Serializable>> cluster_map = new HashMap<Serializable,Set<Serializable>>();
|
||||
|
||||
for (Set<String> set : blocks.values()) {
|
||||
if (set.size() < 2) continue;
|
||||
for (String a : set) {
|
||||
for (String b : set) {
|
||||
if (a == b) continue;
|
||||
if (cluster_map.containsKey(a) && cluster_map.get(a).contains(b)) continue;
|
||||
if (cluster_map.containsKey(b) && cluster_map.get(b).contains(a)) continue;
|
||||
double d = _distance.d(a,b);
|
||||
if (d <= radius || radius < 0) {
|
||||
Set<Serializable> l = null;
|
||||
if (!cluster_map.containsKey(a)) {
|
||||
l = new TreeSet<Serializable>();
|
||||
l.add(a);
|
||||
cluster_map.put(a, l);
|
||||
} else {
|
||||
l = cluster_map.get(a);
|
||||
}
|
||||
l.add(b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>();
|
||||
for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
|
||||
Set<Serializable> v = e.getValue();
|
||||
if (v.size() > 1) {
|
||||
clusters.add(v);
|
||||
}
|
||||
}
|
||||
Collections.sort(clusters, new SizeComparator());
|
||||
|
||||
return clusters;
|
||||
}
|
||||
|
||||
}
|
@ -1,63 +0,0 @@
|
||||
package edu.mit.simile.vicino.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
import edu.mit.simile.vicino.vptree.Node;
|
||||
import edu.mit.simile.vicino.vptree.VPTree;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
|
||||
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
|
||||
|
||||
public class VPTreeClusterer extends Clusterer {
|
||||
|
||||
VPTreeBuilder _treeBuilder;
|
||||
Distance _distance;
|
||||
|
||||
public VPTreeClusterer(Distance d) {
|
||||
_distance = d;
|
||||
_treeBuilder = new VPTreeBuilder(d);
|
||||
}
|
||||
|
||||
public void populate(String s) {
|
||||
_treeBuilder.populate(s);
|
||||
}
|
||||
|
||||
public List<Set<Serializable>> getClusters(double radius) {
|
||||
VPTree tree = _treeBuilder.buildVPTree();
|
||||
System.out.println("distances after the tree: " + _distance.getCount());
|
||||
Set<Node> nodes = _treeBuilder.getNodes();
|
||||
|
||||
VPTreeSeeker seeker = new VPTreeSeeker(_distance,tree);
|
||||
Map<Serializable,Boolean> flags = new HashMap<Serializable,Boolean>();
|
||||
for (Node n : nodes) {
|
||||
flags.put(n.get(), true);
|
||||
}
|
||||
|
||||
Map<Serializable,Set<Serializable>> map = new HashMap<Serializable,Set<Serializable>>();
|
||||
for (Node n : nodes) {
|
||||
Serializable s = n.get();
|
||||
if (flags.get(s)) {
|
||||
Set<Serializable> results = seeker.range(s, radius);
|
||||
for (Serializable ss : results) {
|
||||
flags.put(ss, false);
|
||||
}
|
||||
if (results.size() > 1) {
|
||||
map.put(s, results);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>(map.values());
|
||||
Collections.sort(clusters, new SizeComparator());
|
||||
|
||||
return clusters;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.tools.bzip2.CBZip2OutputStream;
|
||||
|
||||
public class BZip2Distance extends PseudoMetricDistance {
|
||||
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
double result = 0.0f;
|
||||
try {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
||||
CBZip2OutputStream os = new CBZip2OutputStream(baos);
|
||||
os.write(str.getBytes());
|
||||
os.close();
|
||||
baos.close();
|
||||
result = baos.toByteArray().length;
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
/**
 * Base class for string distance functions that also keeps a tally of how
 * many distance evaluations have been performed.
 */
public abstract class Distance {

    /** Evaluation tally; implementations in this package increment it from {@link #d}. */
    int counter = 0;

    /**
     * Computes the distance between two strings.
     *
     * @param x first string
     * @param y second string
     * @return the distance between {@code x} and {@code y}
     */
    public abstract double d(String x, String y);

    /** Sets the evaluation tally back to zero. */
    public void resetCounter() {
        this.counter = 0;
    }

    /** @return the current evaluation tally */
    public int getCount() {
        return this.counter;
    }
}
|
@ -1,26 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public class GZipDistance extends PseudoMetricDistance {
|
||||
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
double result = 0.0f;
|
||||
try {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
||||
GZIPOutputStream os = new GZIPOutputStream(baos);
|
||||
os.write(str.getBytes());
|
||||
os.close();
|
||||
baos.close();
|
||||
result = baos.toByteArray().length;
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.Jaccard;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
public class JaccardDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public JaccardDistance() {
|
||||
this.distance = new Jaccard();
|
||||
}
|
||||
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.Jaro;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
public class JaroDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public JaroDistance() {
|
||||
this.distance = new Jaro();
|
||||
}
|
||||
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
public class JaroWinklerDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public JaroWinklerDistance() {
|
||||
this.distance = new JaroWinkler();
|
||||
}
|
||||
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.JaroWinklerTFIDF;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
public class JaroWinklerTFIDFDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public JaroWinklerTFIDFDistance() {
|
||||
this.distance = new JaroWinklerTFIDF();
|
||||
}
|
||||
|
||||
protected double d2(String x, String y) {
|
||||
return this.distance.score(x, y);
|
||||
}
|
||||
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import com.wcohen.ss.Levenstein;
|
||||
import com.wcohen.ss.api.StringDistance;
|
||||
|
||||
public class LevenshteinDistance extends MetricDistance {
|
||||
|
||||
StringDistance distance;
|
||||
|
||||
public LevenshteinDistance() {
|
||||
this.distance = new Levenstein();
|
||||
}
|
||||
|
||||
public double d2(String x, String y) {
|
||||
return Math.abs(this.distance.score(x, y));
|
||||
}
|
||||
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
|
||||
public abstract class MetricDistance extends Distance {
|
||||
|
||||
/*
|
||||
* public float d(String x,String y) {
|
||||
* float dxy = d2(x,y);
|
||||
* float dx = d2(x,"");
|
||||
* float dy = d2(y,"");
|
||||
* float result = dxy / (dx + dy);
|
||||
* return result;
|
||||
* }
|
||||
*/
|
||||
|
||||
public double d(String x, String y) {
|
||||
double result = d2(x, y);
|
||||
counter += 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
abstract double d2(String x, String y);
|
||||
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import com.colloquial.arithcode.ArithCodeOutputStream;
|
||||
import com.colloquial.arithcode.PPMModel;
|
||||
|
||||
public class PPMDistance extends PseudoMetricDistance {
|
||||
|
||||
public double d2(String x, String y) {
|
||||
String str = x + y;
|
||||
double result = 0.0f;
|
||||
try {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
|
||||
ArithCodeOutputStream os = new ArithCodeOutputStream(baos,new PPMModel(8));
|
||||
os.write(str.getBytes());
|
||||
os.close();
|
||||
baos.close();
|
||||
result = baos.toByteArray().length;
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
package edu.mit.simile.vicino.distances;
|
||||
|
||||
|
||||
public abstract class PseudoMetricDistance extends Distance {
|
||||
|
||||
public double d(String x, String y) {
|
||||
double cxx = d2(x, x);
|
||||
double cyy = d2(y, y);
|
||||
double cxy = d2(x, y);
|
||||
double cyx = d2(y, x);
|
||||
counter += 4;
|
||||
return 10.0d * ((cxy + cyx) / (cxx + cyy) - 1.0d);
|
||||
}
|
||||
|
||||
protected abstract double d2(String x, String y);
|
||||
}
|
@ -1,58 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* This class represent a couple (Object, distance) value of that Object from
|
||||
* the Vp in each step of the algorithm.
|
||||
*
|
||||
* @author Paolo Ciccarese
|
||||
*/
|
||||
public class Node implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = -2077473220894258550L;
|
||||
|
||||
private final Serializable obj;
|
||||
private double distance;
|
||||
|
||||
public Node(Serializable obj, int i) {
|
||||
this.obj = obj;
|
||||
this.distance = i;
|
||||
}
|
||||
|
||||
public Node(Serializable obj) {
|
||||
this.obj = obj;
|
||||
}
|
||||
|
||||
public Serializable get() {
|
||||
return this.obj;
|
||||
}
|
||||
|
||||
public void setDistance(double distance) {
|
||||
this.distance = distance;
|
||||
}
|
||||
|
||||
public double getDistance() {
|
||||
return distance;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return obj.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o instanceof Node) {
|
||||
return ((Node) o).get().equals(this.obj);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return this.obj.hashCode();
|
||||
}
|
||||
}
|
@ -1,94 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
public class NodeSorter {
|
||||
|
||||
/**
|
||||
* Sorts and array of objects.
|
||||
*/
|
||||
public void sort(Node nodes[]) {
|
||||
NodeSorter.sort(nodes, 0, nodes.length - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort array of Objects using the QuickSort algorithm.
|
||||
*
|
||||
* @param s
|
||||
* An Object[].
|
||||
* @param lo
|
||||
* The current lower bound.
|
||||
* @param hi
|
||||
* The current upper bound.
|
||||
*/
|
||||
public static void sort(Node nodes[], int lo, int hi) {
|
||||
if (lo >= hi) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use median-of-three(lo, mid, hi) to pick a partition. Also swap them
|
||||
* into relative order while we are at it.
|
||||
*/
|
||||
int mid = (lo + hi) / 2;
|
||||
|
||||
if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
|
||||
// Swap.
|
||||
Node tmp = nodes[lo];
|
||||
nodes[lo] = nodes[mid];
|
||||
nodes[mid] = tmp;
|
||||
}
|
||||
|
||||
if (nodes[mid].getDistance() > nodes[hi].getDistance()) {
|
||||
// Swap .
|
||||
Node tmp = nodes[mid];
|
||||
nodes[mid] = nodes[hi];
|
||||
nodes[hi] = tmp;
|
||||
|
||||
if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
|
||||
// Swap.
|
||||
Node tmp2 = nodes[lo];
|
||||
nodes[lo] = nodes[mid];
|
||||
nodes[mid] = tmp2;
|
||||
}
|
||||
}
|
||||
|
||||
// Start one past lo since already handled lo.
|
||||
|
||||
int left = lo + 1;
|
||||
|
||||
// Similarly, end one before hi since already handled hi.
|
||||
|
||||
int right = hi - 1;
|
||||
|
||||
// If there are three or fewer elements, we are done.
|
||||
|
||||
if (left >= right) {
|
||||
return;
|
||||
}
|
||||
|
||||
Node partition = nodes[mid];
|
||||
|
||||
while (true) {
|
||||
while (nodes[right].getDistance() > partition.getDistance()) {
|
||||
--right;
|
||||
}
|
||||
|
||||
while (left < right && nodes[left].getDistance() <= partition.getDistance()) {
|
||||
++left;
|
||||
}
|
||||
|
||||
if (left < right) {
|
||||
// Swap.
|
||||
Node tmp = nodes[left];
|
||||
nodes[left] = nodes[right];
|
||||
nodes[right] = tmp;
|
||||
|
||||
--right;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
sort(nodes, lo, left);
|
||||
sort(nodes, left + 1, hi);
|
||||
}
|
||||
}
|
@ -1,56 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * A tree node: a fixed object, a median value, and optional left and right
 * children.
 *
 * @author Paolo Ciccarese
 */
public class TNode implements Serializable {

    private static final long serialVersionUID = -217604190976851241L;

    private final Serializable obj;
    private double median;
    private TNode left;
    private TNode right;

    /**
     * Creates a node for {@code obj}. The object is fixed at construction;
     * the children are attached later through the setters.
     */
    public TNode(Serializable obj) {
        this.obj = obj;
    }

    /** @return the object held by this node */
    public Serializable get() {
        return obj;
    }

    /** @return the median recorded for this node */
    public double getMedian() {
        return this.median;
    }

    /** Records the median for this node. */
    public void setMedian(double median) {
        this.median = median;
    }

    /** @return the left child, or {@code null} if none was set */
    public TNode getLeft() {
        return this.left;
    }

    /** Attaches the left child. */
    public void setLeft(TNode leftNode) {
        left = leftNode;
    }

    /** @return the right child, or {@code null} if none was set */
    public TNode getRight() {
        return this.right;
    }

    /** Attaches the right child. */
    public void setRight(TNode rightNode) {
        right = rightNode;
    }

    /** @return the string form of the held object */
    @Override
    public String toString() {
        return obj.toString();
    }
}
|
@ -1,33 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The VPTree class.
|
||||
*
|
||||
* @author Paolo Ciccarese
|
||||
*/
|
||||
public class VPTree implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1291056732155841123L;
|
||||
|
||||
private TNode root;
|
||||
|
||||
/**
|
||||
* Sets the root of the VPTree.
|
||||
*
|
||||
* @param root The VPTree root.
|
||||
*/
|
||||
public void setRoot(TNode root) {
|
||||
this.root = root;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the root of the VPTree.
|
||||
*
|
||||
* @return The VPTree root.
|
||||
*/
|
||||
public TNode getRoot() {
|
||||
return root;
|
||||
}
|
||||
}
|
@ -1,192 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
/**
|
||||
* @author Paolo Ciccarese
|
||||
* @author Stefano Mazzocchi
|
||||
*/
|
||||
public class VPTreeBuilder {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
private static final boolean OPTIMIZED = false;
|
||||
private static final int sample_size = 10;
|
||||
|
||||
private Random generator = new Random(System.currentTimeMillis());
|
||||
|
||||
private final Distance distance;
|
||||
|
||||
private Set<Node> nodes = new HashSet<Node>();
|
||||
|
||||
/**
|
||||
* Defines a VPTree Builder for a specific distance.
|
||||
*
|
||||
* @param distance
|
||||
* The class implementing the distance.
|
||||
*/
|
||||
public VPTreeBuilder(Distance distance) {
|
||||
this.distance = distance;
|
||||
}
|
||||
|
||||
public Set<Node> getNodes() {
|
||||
return this.nodes;
|
||||
}
|
||||
|
||||
public void populate(Serializable s) {
|
||||
nodes.add(new Node(s));
|
||||
}
|
||||
|
||||
public VPTree buildVPTree() {
|
||||
if (DEBUG) {
|
||||
for (Node n : this.nodes) {
|
||||
System.out.println(n.get().toString());
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]);
|
||||
VPTree tree = new VPTree();
|
||||
if (nodes_array.length > 0) {
|
||||
tree.setRoot(makeNode(nodes_array, 0, nodes_array.length - 1));
|
||||
}
|
||||
return tree;
|
||||
}
|
||||
|
||||
public VPTree buildVPTree(Collection<? extends Serializable> values) {
|
||||
reset();
|
||||
for (Serializable s : values) {
|
||||
populate(s);
|
||||
}
|
||||
return buildVPTree();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
this.nodes.clear();
|
||||
}
|
||||
|
||||
private TNode makeNode(Node nodes[], int begin, int end) {
|
||||
|
||||
int delta = end - begin;
|
||||
|
||||
if (DEBUG) System.out.println("\ndelta: " + delta);
|
||||
|
||||
if (delta == 0) {
|
||||
TNode vpNode = new TNode(nodes[begin].get());
|
||||
vpNode.setMedian(0);
|
||||
return vpNode;
|
||||
} else if (delta < 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Node randomNode = getVantagePoint(nodes, begin, end);
|
||||
TNode vpNode = new TNode(randomNode.get());
|
||||
|
||||
if (DEBUG) System.out.println("\nvp-node: " + vpNode.get().toString());
|
||||
|
||||
calculateDistances(vpNode, nodes, begin, end);
|
||||
orderDistances(nodes, begin, end);
|
||||
fixVantagPoint(randomNode, nodes, begin, end);
|
||||
|
||||
if (DEBUG) {
|
||||
for (int i = begin; i <= end; i++) {
|
||||
System.out.println(" +-- " + nodes[i].getDistance() + " --> " + nodes[i].get());
|
||||
}
|
||||
}
|
||||
|
||||
float median = (float) median(nodes, begin, end);
|
||||
vpNode.setMedian(median);
|
||||
|
||||
int i = 0;
|
||||
for (i = begin + 1; i < end; i++) {
|
||||
if (nodes[i].getDistance() >= median) {
|
||||
vpNode.setLeft(makeNode(nodes, begin + 1, i - 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
vpNode.setRight(makeNode(nodes, i, end));
|
||||
|
||||
return vpNode;
|
||||
}
|
||||
|
||||
private Node getVantagePoint(Node nodes[], int begin, int end) {
|
||||
if (OPTIMIZED) {
|
||||
Node buffer[] = new Node[sample_size];
|
||||
for (int i = 0; i < sample_size; i++) {
|
||||
buffer[i] = getRandomNode(nodes,begin,end);
|
||||
}
|
||||
|
||||
double bestSpread = 0;
|
||||
Node bestNode = buffer[0];
|
||||
for (int i = 0; i < sample_size; i++) {
|
||||
calculateDistances(new TNode(buffer[i]), buffer, 0, buffer.length - 1);
|
||||
orderDistances(nodes, begin, end);
|
||||
double median = (double) median(nodes, begin, end);
|
||||
double spread = deviation(buffer, median);
|
||||
System.out.println(" " + spread);
|
||||
if (spread > bestSpread) {
|
||||
bestSpread = spread;
|
||||
bestNode = buffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("best: " + bestSpread);
|
||||
return bestNode;
|
||||
} else {
|
||||
return getRandomNode(nodes,begin,end);
|
||||
}
|
||||
}
|
||||
|
||||
private Node getRandomNode(Node nodes[], int begin, int end) {
|
||||
return nodes[begin + generator.nextInt(end - begin)];
|
||||
}
|
||||
|
||||
private double deviation(Node buffer[], double median) {
|
||||
double sum = 0;
|
||||
for (int i = 0; i < buffer.length; i++) {
|
||||
sum += Math.pow(buffer[i].getDistance() - median, 2);
|
||||
}
|
||||
return sum / buffer.length;
|
||||
}
|
||||
|
||||
public double median(Node nodes[], int begin, int end) {
|
||||
int delta = end - begin;
|
||||
int middle = delta / 2;
|
||||
|
||||
if (delta % 2 == 0) {
|
||||
return nodes[begin + middle].getDistance();
|
||||
} else {
|
||||
return (nodes[begin + middle].getDistance() + nodes[begin + middle + 1].getDistance()) / 2.0d;
|
||||
}
|
||||
}
|
||||
|
||||
private void calculateDistances(TNode pivot, Node nodes[], int begin, int end) {
|
||||
Serializable x = pivot.get();
|
||||
for (int i = begin; i <= end; i++) {
|
||||
Serializable y = nodes[i].get();
|
||||
double d = (x == y || x.equals(y)) ? 0.0d : distance.d(x.toString(), y.toString());
|
||||
nodes[i].setDistance(d);
|
||||
}
|
||||
}
|
||||
|
||||
private void fixVantagPoint(Node pivot, Node nodes[], int begin, int end) {
|
||||
for (int i = begin; i < end; i++) {
|
||||
if (nodes[i] == pivot) {
|
||||
if (i > begin) {
|
||||
Node tmp = nodes[begin];
|
||||
nodes[begin] = pivot;
|
||||
nodes[i] = tmp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void orderDistances(Node nodes[], int begin, int end) {
|
||||
NodeSorter.sort(nodes, begin, end);
|
||||
}
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
package edu.mit.simile.vicino.vptree;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import edu.mit.simile.vicino.distances.Distance;
|
||||
|
||||
/**
|
||||
* @author Paolo Ciccarese
|
||||
*/
|
||||
public class VPTreeSeeker {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
VPTree tree;
|
||||
Distance distance;
|
||||
|
||||
public VPTreeSeeker(Distance distance, VPTree tree) {
|
||||
this.distance = distance;
|
||||
this.tree = tree;
|
||||
}
|
||||
|
||||
public Set<Serializable> range(Serializable query, double range) {
|
||||
if (DEBUG) System.out.println("--------------- " + query + " " + range);
|
||||
return rangeTraversal(query, range, tree.getRoot(), new HashSet<Serializable>());
|
||||
}
|
||||
|
||||
private Set<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, Set<Serializable> results) {
|
||||
|
||||
if (DEBUG) System.out.println("> " + tNode);
|
||||
|
||||
if (tNode != null) {
|
||||
double distance = this.distance.d(query.toString(), tNode.get().toString());
|
||||
|
||||
if (distance <= range) {
|
||||
if (DEBUG) System.out.println("*** add ***");
|
||||
results.add(tNode.get());
|
||||
}
|
||||
|
||||
if ((distance + range) < tNode.getMedian()) {
|
||||
if (DEBUG) System.out.println("left: " + distance + " + " + range + " < " + tNode.getMedian());
|
||||
rangeTraversal(query, range, tNode.getLeft(), results);
|
||||
} else if ((distance - range) > tNode.getMedian()) {
|
||||
if (DEBUG) System.out.println("right: " + distance + " + " + range + " > " + tNode.getMedian());
|
||||
rangeTraversal(query, range, tNode.getRight(), results);
|
||||
} else {
|
||||
if (DEBUG) System.out.println("left & right: " + distance + " + " + range + " = " + tNode.getMedian());
|
||||
rangeTraversal(query, range, tNode.getLeft(), results);
|
||||
rangeTraversal(query, range, tNode.getRight(), results);
|
||||
}
|
||||
}
|
||||
|
||||
if (DEBUG) System.out.println("< " + tNode);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
}
|
@ -1,136 +0,0 @@
|
||||
/*
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution, if
|
||||
* any, must include the following acknowlegement:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowlegement may appear in the software itself,
|
||||
* if and wherever such third-party acknowlegements normally appear.
|
||||
*
|
||||
* 4. The names "Ant" and "Apache Software
|
||||
* Foundation" must not be used to endorse or promote products derived
|
||||
* from this software without prior written permission. For written
|
||||
* permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache"
|
||||
* nor may "Apache" appear in their names without prior written
|
||||
* permission of the Apache Group.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
||||
* great code.
|
||||
*/
|
||||
|
||||
package org.apache.tools.bzip2;
|
||||
|
||||
/**
 * Constants and static tables shared by the BZip2 compress and decompress
 * classes.
 *
 * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
 */
public interface BZip2Constants {

    /** Size unit of a block; the stream classes multiply it by a level number. */
    int baseBlockSize = 100000;

    int MAX_ALPHA_SIZE = 258;
    int MAX_CODE_LEN = 23;

    int RUNA = 0;
    int RUNB = 1;

    int N_GROUPS = 6;
    int G_SIZE = 50;
    int N_ITERS = 4;

    /** Selector count bound, derived from {@link #G_SIZE}. */
    int MAX_SELECTORS = 2 + (900000 / G_SIZE);

    int NUM_OVERSHOOT_BYTES = 20;

    /**
     * Fixed table of 512 numbers.
     * NOTE(review): presumably the block-randomisation table - confirm against
     * the stream classes. The array is shared and mutable; callers must not
     * write to it.
     */
    int[] rNums = {
        619, 720, 127, 481, 931, 816, 813, 233, 566, 247,
        985, 724, 205, 454, 863, 491, 741, 242, 949, 214,
        733, 859, 335, 708, 621, 574, 73, 654, 730, 472,
        419, 436, 278, 496, 867, 210, 399, 680, 480, 51,
        878, 465, 811, 169, 869, 675, 611, 697, 867, 561,
        862, 687, 507, 283, 482, 129, 807, 591, 733, 623,
        150, 238, 59, 379, 684, 877, 625, 169, 643, 105,
        170, 607, 520, 932, 727, 476, 693, 425, 174, 647,
        73, 122, 335, 530, 442, 853, 695, 249, 445, 515,
        909, 545, 703, 919, 874, 474, 882, 500, 594, 612,
        641, 801, 220, 162, 819, 984, 589, 513, 495, 799,
        161, 604, 958, 533, 221, 400, 386, 867, 600, 782,
        382, 596, 414, 171, 516, 375, 682, 485, 911, 276,
        98, 553, 163, 354, 666, 933, 424, 341, 533, 870,
        227, 730, 475, 186, 263, 647, 537, 686, 600, 224,
        469, 68, 770, 919, 190, 373, 294, 822, 808, 206,
        184, 943, 795, 384, 383, 461, 404, 758, 839, 887,
        715, 67, 618, 276, 204, 918, 873, 777, 604, 560,
        951, 160, 578, 722, 79, 804, 96, 409, 713, 940,
        652, 934, 970, 447, 318, 353, 859, 672, 112, 785,
        645, 863, 803, 350, 139, 93, 354, 99, 820, 908,
        609, 772, 154, 274, 580, 184, 79, 626, 630, 742,
        653, 282, 762, 623, 680, 81, 927, 626, 789, 125,
        411, 521, 938, 300, 821, 78, 343, 175, 128, 250,
        170, 774, 972, 275, 999, 639, 495, 78, 352, 126,
        857, 956, 358, 619, 580, 124, 737, 594, 701, 612,
        669, 112, 134, 694, 363, 992, 809, 743, 168, 974,
        944, 375, 748, 52, 600, 747, 642, 182, 862, 81,
        344, 805, 988, 739, 511, 655, 814, 334, 249, 515,
        897, 955, 664, 981, 649, 113, 974, 459, 893, 228,
        433, 837, 553, 268, 926, 240, 102, 654, 459, 51,
        686, 754, 806, 760, 493, 403, 415, 394, 687, 700,
        946, 670, 656, 610, 738, 392, 760, 799, 887, 653,
        978, 321, 576, 617, 626, 502, 894, 679, 243, 440,
        680, 879, 194, 572, 640, 724, 926, 56, 204, 700,
        707, 151, 457, 449, 797, 195, 791, 558, 945, 679,
        297, 59, 87, 824, 713, 663, 412, 693, 342, 606,
        134, 108, 571, 364, 631, 212, 174, 643, 304, 329,
        343, 97, 430, 751, 497, 314, 983, 374, 822, 928,
        140, 206, 73, 263, 980, 736, 876, 478, 430, 305,
        170, 514, 364, 692, 829, 82, 855, 953, 676, 246,
        369, 970, 294, 750, 807, 827, 150, 790, 288, 923,
        804, 378, 215, 828, 592, 281, 565, 555, 710, 82,
        896, 831, 547, 261, 524, 462, 293, 465, 502, 56,
        661, 821, 976, 991, 658, 869, 905, 758, 745, 193,
        768, 550, 608, 933, 378, 286, 215, 979, 792, 961,
        61, 688, 793, 644, 986, 403, 106, 366, 905, 644,
        372, 567, 466, 434, 645, 210, 389, 550, 919, 135,
        780, 773, 635, 389, 707, 100, 626, 958, 165, 504,
        920, 176, 193, 713, 857, 265, 203, 50, 668, 108,
        645, 990, 626, 197, 510, 357, 358, 850, 858, 364,
        936, 638
    };
}
|
@ -1,865 +0,0 @@
|
||||
/*
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution, if
|
||||
* any, must include the following acknowlegement:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowlegement may appear in the software itself,
|
||||
* if and wherever such third-party acknowlegements normally appear.
|
||||
*
|
||||
* 4. The names "Ant" and "Apache Software
|
||||
* Foundation" must not be used to endorse or promote products derived
|
||||
* from this software without prior written permission. For written
|
||||
* permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache"
|
||||
* nor may "Apache" appear in their names without prior written
|
||||
* permission of the Apache Group.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
||||
* great code.
|
||||
*/
|
||||
package org.apache.tools.bzip2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* An input stream that decompresses from the BZip2 format (without the file
|
||||
* header chars) to be read as any other stream.
|
||||
*
|
||||
* @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
|
||||
*/
|
||||
public class CBZip2InputStream extends InputStream implements BZip2Constants {
|
||||
private static void cadvise() {
|
||||
System.out.println("CRC Error");
|
||||
//throw new CCoruptionError();
|
||||
}
|
||||
|
||||
private static void compressedStreamEOF() {
|
||||
cadvise();
|
||||
}
|
||||
|
||||
private void makeMaps() {
|
||||
int i;
|
||||
nInUse = 0;
|
||||
for (i = 0; i < 256; i++) {
|
||||
if (inUse[i]) {
|
||||
seqToUnseq[nInUse] = (char) i;
|
||||
unseqToSeq[i] = (char) nInUse;
|
||||
nInUse++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
index of the last char in the block, so
|
||||
the block size == last + 1.
|
||||
*/
|
||||
private int last;
|
||||
|
||||
/*
|
||||
index in zptr[] of original string after sorting.
|
||||
*/
|
||||
private int origPtr;
|
||||
|
||||
/*
|
||||
always: in the range 0 .. 9.
|
||||
The current block size is 100000 * this number.
|
||||
*/
|
||||
private int blockSize100k;
|
||||
|
||||
private boolean blockRandomised;
|
||||
|
||||
private int bsBuff;
|
||||
private int bsLive;
|
||||
private CRC mCrc = new CRC();
|
||||
|
||||
private boolean[] inUse = new boolean[256];
|
||||
private int nInUse;
|
||||
|
||||
private char[] seqToUnseq = new char[256];
|
||||
private char[] unseqToSeq = new char[256];
|
||||
|
||||
private char[] selector = new char[MAX_SELECTORS];
|
||||
private char[] selectorMtf = new char[MAX_SELECTORS];
|
||||
|
||||
private int[] tt;
|
||||
private char[] ll8;
|
||||
|
||||
/*
|
||||
freq table collected to save a pass over the data
|
||||
during decompression.
|
||||
*/
|
||||
private int[] unzftab = new int[256];
|
||||
|
||||
private int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
||||
private int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
||||
private int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE];
|
||||
private int[] minLens = new int[N_GROUPS];
|
||||
|
||||
private InputStream bsStream;
|
||||
|
||||
private boolean streamEnd = false;
|
||||
|
||||
private int currentChar = -1;
|
||||
|
||||
private static final int START_BLOCK_STATE = 1;
|
||||
private static final int RAND_PART_A_STATE = 2;
|
||||
private static final int RAND_PART_B_STATE = 3;
|
||||
private static final int RAND_PART_C_STATE = 4;
|
||||
private static final int NO_RAND_PART_A_STATE = 5;
|
||||
private static final int NO_RAND_PART_B_STATE = 6;
|
||||
private static final int NO_RAND_PART_C_STATE = 7;
|
||||
|
||||
private int currentState = START_BLOCK_STATE;
|
||||
|
||||
private int storedBlockCRC, storedCombinedCRC;
|
||||
private int computedBlockCRC, computedCombinedCRC;
|
||||
|
||||
int i2, count, chPrev, ch2;
|
||||
int i, tPos;
|
||||
int rNToGo = 0;
|
||||
int rTPos = 0;
|
||||
int j2;
|
||||
char z;
|
||||
|
||||
public CBZip2InputStream(InputStream zStream) {
|
||||
ll8 = null;
|
||||
tt = null;
|
||||
bsSetStream(zStream);
|
||||
initialize();
|
||||
initBlock();
|
||||
setupBlock();
|
||||
}
|
||||
|
||||
public int read() {
|
||||
if (streamEnd) {
|
||||
return -1;
|
||||
} else {
|
||||
int retChar = currentChar;
|
||||
switch(currentState) {
|
||||
case START_BLOCK_STATE:
|
||||
break;
|
||||
case RAND_PART_A_STATE:
|
||||
break;
|
||||
case RAND_PART_B_STATE:
|
||||
setupRandPartB();
|
||||
break;
|
||||
case RAND_PART_C_STATE:
|
||||
setupRandPartC();
|
||||
break;
|
||||
case NO_RAND_PART_A_STATE:
|
||||
break;
|
||||
case NO_RAND_PART_B_STATE:
|
||||
setupNoRandPartB();
|
||||
break;
|
||||
case NO_RAND_PART_C_STATE:
|
||||
setupNoRandPartC();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return retChar;
|
||||
}
|
||||
}
|
||||
|
||||
private void initialize() {
|
||||
char magic3, magic4;
|
||||
magic3 = bsGetUChar();
|
||||
magic4 = bsGetUChar();
|
||||
if (magic3 != 'h' || magic4 < '1' || magic4 > '9') {
|
||||
bsFinishedWithStream();
|
||||
streamEnd = true;
|
||||
return;
|
||||
}
|
||||
|
||||
setDecompressStructureSizes(magic4 - '0');
|
||||
computedCombinedCRC = 0;
|
||||
}
|
||||
|
||||
private void initBlock() {
|
||||
char magic1, magic2, magic3, magic4;
|
||||
char magic5, magic6;
|
||||
magic1 = bsGetUChar();
|
||||
magic2 = bsGetUChar();
|
||||
magic3 = bsGetUChar();
|
||||
magic4 = bsGetUChar();
|
||||
magic5 = bsGetUChar();
|
||||
magic6 = bsGetUChar();
|
||||
if (magic1 == 0x17 && magic2 == 0x72 && magic3 == 0x45
|
||||
&& magic4 == 0x38 && magic5 == 0x50 && magic6 == 0x90) {
|
||||
complete();
|
||||
return;
|
||||
}
|
||||
|
||||
if (magic1 != 0x31 || magic2 != 0x41 || magic3 != 0x59
|
||||
|| magic4 != 0x26 || magic5 != 0x53 || magic6 != 0x59) {
|
||||
badBlockHeader();
|
||||
streamEnd = true;
|
||||
return;
|
||||
}
|
||||
|
||||
storedBlockCRC = bsGetInt32();
|
||||
|
||||
if (bsR(1) == 1) {
|
||||
blockRandomised = true;
|
||||
} else {
|
||||
blockRandomised = false;
|
||||
}
|
||||
|
||||
// currBlockNo++;
|
||||
getAndMoveToFrontDecode();
|
||||
|
||||
mCrc.initialiseCRC();
|
||||
currentState = START_BLOCK_STATE;
|
||||
}
|
||||
|
||||
private void endBlock() {
|
||||
computedBlockCRC = mCrc.getFinalCRC();
|
||||
/* A bad CRC is considered a fatal error. */
|
||||
if (storedBlockCRC != computedBlockCRC) {
|
||||
crcError();
|
||||
}
|
||||
|
||||
computedCombinedCRC = (computedCombinedCRC << 1)
|
||||
| (computedCombinedCRC >>> 31);
|
||||
computedCombinedCRC ^= computedBlockCRC;
|
||||
}
|
||||
|
||||
private void complete() {
|
||||
storedCombinedCRC = bsGetInt32();
|
||||
if (storedCombinedCRC != computedCombinedCRC) {
|
||||
crcError();
|
||||
}
|
||||
|
||||
bsFinishedWithStream();
|
||||
streamEnd = true;
|
||||
}
|
||||
|
||||
private static void blockOverrun() {
|
||||
cadvise();
|
||||
}
|
||||
|
||||
private static void badBlockHeader() {
|
||||
cadvise();
|
||||
}
|
||||
|
||||
private static void crcError() {
|
||||
cadvise();
|
||||
}
|
||||
|
||||
private void bsFinishedWithStream() {
|
||||
try {
|
||||
if (this.bsStream != null) {
|
||||
if (this.bsStream != System.in) {
|
||||
this.bsStream.close();
|
||||
this.bsStream = null;
|
||||
}
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
//ignore
|
||||
}
|
||||
}
|
||||
|
||||
private void bsSetStream(InputStream f) {
|
||||
bsStream = f;
|
||||
bsLive = 0;
|
||||
bsBuff = 0;
|
||||
}
|
||||
|
||||
private int bsR(int n) {
|
||||
int v;
|
||||
while (bsLive < n) {
|
||||
int zzi;
|
||||
char thech = 0;
|
||||
try {
|
||||
thech = (char) bsStream.read();
|
||||
} catch (IOException e) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
if (thech == -1) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
zzi = thech;
|
||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
||||
bsLive += 8;
|
||||
}
|
||||
|
||||
v = (bsBuff >> (bsLive - n)) & ((1 << n) - 1);
|
||||
bsLive -= n;
|
||||
return v;
|
||||
}
|
||||
|
||||
private char bsGetUChar() {
|
||||
return (char) bsR(8);
|
||||
}
|
||||
|
||||
private int bsGetint() {
|
||||
int u = 0;
|
||||
u = (u << 8) | bsR(8);
|
||||
u = (u << 8) | bsR(8);
|
||||
u = (u << 8) | bsR(8);
|
||||
u = (u << 8) | bsR(8);
|
||||
return u;
|
||||
}
|
||||
|
||||
private int bsGetIntVS(int numBits) {
|
||||
return (int) bsR(numBits);
|
||||
}
|
||||
|
||||
private int bsGetInt32() {
|
||||
return (int) bsGetint();
|
||||
}
|
||||
|
||||
private void hbCreateDecodeTables(int[] limit, int[] base,
|
||||
int[] perm, char[] length,
|
||||
int minLen, int maxLen, int alphaSize) {
|
||||
int pp, i, j, vec;
|
||||
|
||||
pp = 0;
|
||||
for (i = minLen; i <= maxLen; i++) {
|
||||
for (j = 0; j < alphaSize; j++) {
|
||||
if (length[j] == i) {
|
||||
perm[pp] = j;
|
||||
pp++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < MAX_CODE_LEN; i++) {
|
||||
base[i] = 0;
|
||||
}
|
||||
for (i = 0; i < alphaSize; i++) {
|
||||
base[length[i] + 1]++;
|
||||
}
|
||||
|
||||
for (i = 1; i < MAX_CODE_LEN; i++) {
|
||||
base[i] += base[i - 1];
|
||||
}
|
||||
|
||||
for (i = 0; i < MAX_CODE_LEN; i++) {
|
||||
limit[i] = 0;
|
||||
}
|
||||
vec = 0;
|
||||
|
||||
for (i = minLen; i <= maxLen; i++) {
|
||||
vec += (base[i + 1] - base[i]);
|
||||
limit[i] = vec - 1;
|
||||
vec <<= 1;
|
||||
}
|
||||
for (i = minLen + 1; i <= maxLen; i++) {
|
||||
base[i] = ((limit[i - 1] + 1) << 1) - base[i];
|
||||
}
|
||||
}
|
||||
|
||||
private void recvDecodingTables() {
|
||||
char len[][] = new char[N_GROUPS][MAX_ALPHA_SIZE];
|
||||
int i, j, t, nGroups, nSelectors, alphaSize;
|
||||
int minLen, maxLen;
|
||||
boolean[] inUse16 = new boolean[16];
|
||||
|
||||
/* Receive the mapping table */
|
||||
for (i = 0; i < 16; i++) {
|
||||
if (bsR(1) == 1) {
|
||||
inUse16[i] = true;
|
||||
} else {
|
||||
inUse16[i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
inUse[i] = false;
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
if (inUse16[i]) {
|
||||
for (j = 0; j < 16; j++) {
|
||||
if (bsR(1) == 1) {
|
||||
inUse[i * 16 + j] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
makeMaps();
|
||||
alphaSize = nInUse + 2;
|
||||
|
||||
/* Now the selectors */
|
||||
nGroups = bsR(3);
|
||||
nSelectors = bsR(15);
|
||||
for (i = 0; i < nSelectors; i++) {
|
||||
j = 0;
|
||||
while (bsR(1) == 1) {
|
||||
j++;
|
||||
}
|
||||
selectorMtf[i] = (char) j;
|
||||
}
|
||||
|
||||
/* Undo the MTF values for the selectors. */
|
||||
{
|
||||
char[] pos = new char[N_GROUPS];
|
||||
char tmp, v;
|
||||
for (v = 0; v < nGroups; v++) {
|
||||
pos[v] = v;
|
||||
}
|
||||
|
||||
for (i = 0; i < nSelectors; i++) {
|
||||
v = selectorMtf[i];
|
||||
tmp = pos[v];
|
||||
while (v > 0) {
|
||||
pos[v] = pos[v - 1];
|
||||
v--;
|
||||
}
|
||||
pos[0] = tmp;
|
||||
selector[i] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now the coding tables */
|
||||
for (t = 0; t < nGroups; t++) {
|
||||
int curr = bsR(5);
|
||||
for (i = 0; i < alphaSize; i++) {
|
||||
while (bsR(1) == 1) {
|
||||
if (bsR(1) == 0) {
|
||||
curr++;
|
||||
} else {
|
||||
curr--;
|
||||
}
|
||||
}
|
||||
len[t][i] = (char) curr;
|
||||
}
|
||||
}
|
||||
|
||||
/* Create the Huffman decoding tables */
|
||||
for (t = 0; t < nGroups; t++) {
|
||||
minLen = 32;
|
||||
maxLen = 0;
|
||||
for (i = 0; i < alphaSize; i++) {
|
||||
if (len[t][i] > maxLen) {
|
||||
maxLen = len[t][i];
|
||||
}
|
||||
if (len[t][i] < minLen) {
|
||||
minLen = len[t][i];
|
||||
}
|
||||
}
|
||||
hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen,
|
||||
maxLen, alphaSize);
|
||||
minLens[t] = minLen;
|
||||
}
|
||||
}
|
||||
|
||||
private void getAndMoveToFrontDecode() {
|
||||
char[] yy = new char[256];
|
||||
int i, j, nextSym, limitLast;
|
||||
int EOB, groupNo, groupPos;
|
||||
|
||||
limitLast = baseBlockSize * blockSize100k;
|
||||
origPtr = bsGetIntVS(24);
|
||||
|
||||
recvDecodingTables();
|
||||
EOB = nInUse + 1;
|
||||
groupNo = -1;
|
||||
groupPos = 0;
|
||||
|
||||
/*
|
||||
Setting up the unzftab entries here is not strictly
|
||||
necessary, but it does save having to do it later
|
||||
in a separate pass, and so saves a block's worth of
|
||||
cache misses.
|
||||
*/
|
||||
for (i = 0; i <= 255; i++) {
|
||||
unzftab[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i <= 255; i++) {
|
||||
yy[i] = (char) i;
|
||||
}
|
||||
|
||||
last = -1;
|
||||
|
||||
{
|
||||
int zt, zn, zvec, zj;
|
||||
if (groupPos == 0) {
|
||||
groupNo++;
|
||||
groupPos = G_SIZE;
|
||||
}
|
||||
groupPos--;
|
||||
zt = selector[groupNo];
|
||||
zn = minLens[zt];
|
||||
zvec = bsR(zn);
|
||||
while (zvec > limit[zt][zn]) {
|
||||
zn++;
|
||||
{
|
||||
{
|
||||
while (bsLive < 1) {
|
||||
int zzi;
|
||||
char thech = 0;
|
||||
try {
|
||||
thech = (char) bsStream.read();
|
||||
} catch (IOException e) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
if (thech == -1) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
zzi = thech;
|
||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
||||
bsLive += 8;
|
||||
}
|
||||
}
|
||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
||||
bsLive--;
|
||||
}
|
||||
zvec = (zvec << 1) | zj;
|
||||
}
|
||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
||||
}
|
||||
|
||||
while (true) {
|
||||
|
||||
if (nextSym == EOB) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (nextSym == RUNA || nextSym == RUNB) {
|
||||
char ch;
|
||||
int s = -1;
|
||||
int N = 1;
|
||||
do {
|
||||
if (nextSym == RUNA) {
|
||||
s = s + (0 + 1) * N;
|
||||
} else if (nextSym == RUNB) {
|
||||
s = s + (1 + 1) * N;
|
||||
}
|
||||
N = N * 2;
|
||||
{
|
||||
int zt, zn, zvec, zj;
|
||||
if (groupPos == 0) {
|
||||
groupNo++;
|
||||
groupPos = G_SIZE;
|
||||
}
|
||||
groupPos--;
|
||||
zt = selector[groupNo];
|
||||
zn = minLens[zt];
|
||||
zvec = bsR(zn);
|
||||
while (zvec > limit[zt][zn]) {
|
||||
zn++;
|
||||
{
|
||||
{
|
||||
while (bsLive < 1) {
|
||||
int zzi;
|
||||
char thech = 0;
|
||||
try {
|
||||
thech = (char) bsStream.read();
|
||||
} catch (IOException e) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
if (thech == -1) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
zzi = thech;
|
||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
||||
bsLive += 8;
|
||||
}
|
||||
}
|
||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
||||
bsLive--;
|
||||
}
|
||||
zvec = (zvec << 1) | zj;
|
||||
}
|
||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
||||
}
|
||||
} while (nextSym == RUNA || nextSym == RUNB);
|
||||
|
||||
s++;
|
||||
ch = seqToUnseq[yy[0]];
|
||||
unzftab[ch] += s;
|
||||
|
||||
while (s > 0) {
|
||||
last++;
|
||||
ll8[last] = ch;
|
||||
s--;
|
||||
}
|
||||
|
||||
if (last >= limitLast) {
|
||||
blockOverrun();
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
char tmp;
|
||||
last++;
|
||||
if (last >= limitLast) {
|
||||
blockOverrun();
|
||||
}
|
||||
|
||||
tmp = yy[nextSym - 1];
|
||||
unzftab[seqToUnseq[tmp]]++;
|
||||
ll8[last] = seqToUnseq[tmp];
|
||||
|
||||
/*
|
||||
This loop is hammered during decompression,
|
||||
hence the unrolling.
|
||||
|
||||
for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1];
|
||||
*/
|
||||
|
||||
j = nextSym - 1;
|
||||
for (; j > 3; j -= 4) {
|
||||
yy[j] = yy[j - 1];
|
||||
yy[j - 1] = yy[j - 2];
|
||||
yy[j - 2] = yy[j - 3];
|
||||
yy[j - 3] = yy[j - 4];
|
||||
}
|
||||
for (; j > 0; j--) {
|
||||
yy[j] = yy[j - 1];
|
||||
}
|
||||
|
||||
yy[0] = tmp;
|
||||
{
|
||||
int zt, zn, zvec, zj;
|
||||
if (groupPos == 0) {
|
||||
groupNo++;
|
||||
groupPos = G_SIZE;
|
||||
}
|
||||
groupPos--;
|
||||
zt = selector[groupNo];
|
||||
zn = minLens[zt];
|
||||
zvec = bsR(zn);
|
||||
while (zvec > limit[zt][zn]) {
|
||||
zn++;
|
||||
{
|
||||
{
|
||||
while (bsLive < 1) {
|
||||
int zzi;
|
||||
char thech = 0;
|
||||
try {
|
||||
thech = (char) bsStream.read();
|
||||
} catch (IOException e) {
|
||||
compressedStreamEOF();
|
||||
}
|
||||
zzi = thech;
|
||||
bsBuff = (bsBuff << 8) | (zzi & 0xff);
|
||||
bsLive += 8;
|
||||
}
|
||||
}
|
||||
zj = (bsBuff >> (bsLive - 1)) & 1;
|
||||
bsLive--;
|
||||
}
|
||||
zvec = (zvec << 1) | zj;
|
||||
}
|
||||
nextSym = perm[zt][zvec - base[zt][zn]];
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setupBlock() {
|
||||
int[] cftab = new int[257];
|
||||
char ch;
|
||||
|
||||
cftab[0] = 0;
|
||||
for (i = 1; i <= 256; i++) {
|
||||
cftab[i] = unzftab[i - 1];
|
||||
}
|
||||
for (i = 1; i <= 256; i++) {
|
||||
cftab[i] += cftab[i - 1];
|
||||
}
|
||||
|
||||
for (i = 0; i <= last; i++) {
|
||||
ch = (char) ll8[i];
|
||||
tt[cftab[ch]] = i;
|
||||
cftab[ch]++;
|
||||
}
|
||||
cftab = null;
|
||||
|
||||
tPos = tt[origPtr];
|
||||
|
||||
count = 0;
|
||||
i2 = 0;
|
||||
ch2 = 256; /* not a char and not EOF */
|
||||
|
||||
if (blockRandomised) {
|
||||
rNToGo = 0;
|
||||
rTPos = 0;
|
||||
setupRandPartA();
|
||||
} else {
|
||||
setupNoRandPartA();
|
||||
}
|
||||
}
|
||||
|
||||
private void setupRandPartA() {
|
||||
if (i2 <= last) {
|
||||
chPrev = ch2;
|
||||
ch2 = ll8[tPos];
|
||||
tPos = tt[tPos];
|
||||
if (rNToGo == 0) {
|
||||
rNToGo = rNums[rTPos];
|
||||
rTPos++;
|
||||
if (rTPos == 512) {
|
||||
rTPos = 0;
|
||||
}
|
||||
}
|
||||
rNToGo--;
|
||||
ch2 ^= (int) ((rNToGo == 1) ? 1 : 0);
|
||||
i2++;
|
||||
|
||||
currentChar = ch2;
|
||||
currentState = RAND_PART_B_STATE;
|
||||
mCrc.updateCRC(ch2);
|
||||
} else {
|
||||
endBlock();
|
||||
initBlock();
|
||||
setupBlock();
|
||||
}
|
||||
}
|
||||
|
||||
private void setupNoRandPartA() {
|
||||
if (i2 <= last) {
|
||||
chPrev = ch2;
|
||||
ch2 = ll8[tPos];
|
||||
tPos = tt[tPos];
|
||||
i2++;
|
||||
|
||||
currentChar = ch2;
|
||||
currentState = NO_RAND_PART_B_STATE;
|
||||
mCrc.updateCRC(ch2);
|
||||
} else {
|
||||
endBlock();
|
||||
initBlock();
|
||||
setupBlock();
|
||||
}
|
||||
}
|
||||
|
||||
private void setupRandPartB() {
|
||||
if (ch2 != chPrev) {
|
||||
currentState = RAND_PART_A_STATE;
|
||||
count = 1;
|
||||
setupRandPartA();
|
||||
} else {
|
||||
count++;
|
||||
if (count >= 4) {
|
||||
z = ll8[tPos];
|
||||
tPos = tt[tPos];
|
||||
if (rNToGo == 0) {
|
||||
rNToGo = rNums[rTPos];
|
||||
rTPos++;
|
||||
if (rTPos == 512) {
|
||||
rTPos = 0;
|
||||
}
|
||||
}
|
||||
rNToGo--;
|
||||
z ^= ((rNToGo == 1) ? 1 : 0);
|
||||
j2 = 0;
|
||||
currentState = RAND_PART_C_STATE;
|
||||
setupRandPartC();
|
||||
} else {
|
||||
currentState = RAND_PART_A_STATE;
|
||||
setupRandPartA();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setupRandPartC() {
|
||||
if (j2 < (int) z) {
|
||||
currentChar = ch2;
|
||||
mCrc.updateCRC(ch2);
|
||||
j2++;
|
||||
} else {
|
||||
currentState = RAND_PART_A_STATE;
|
||||
i2++;
|
||||
count = 0;
|
||||
setupRandPartA();
|
||||
}
|
||||
}
|
||||
|
||||
private void setupNoRandPartB() {
|
||||
if (ch2 != chPrev) {
|
||||
currentState = NO_RAND_PART_A_STATE;
|
||||
count = 1;
|
||||
setupNoRandPartA();
|
||||
} else {
|
||||
count++;
|
||||
if (count >= 4) {
|
||||
z = ll8[tPos];
|
||||
tPos = tt[tPos];
|
||||
currentState = NO_RAND_PART_C_STATE;
|
||||
j2 = 0;
|
||||
setupNoRandPartC();
|
||||
} else {
|
||||
currentState = NO_RAND_PART_A_STATE;
|
||||
setupNoRandPartA();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setupNoRandPartC() {
|
||||
if (j2 < (int) z) {
|
||||
currentChar = ch2;
|
||||
mCrc.updateCRC(ch2);
|
||||
j2++;
|
||||
} else {
|
||||
currentState = NO_RAND_PART_A_STATE;
|
||||
i2++;
|
||||
count = 0;
|
||||
setupNoRandPartA();
|
||||
}
|
||||
}
|
||||
|
||||
private void setDecompressStructureSizes(int newSize100k) {
|
||||
if (!(0 <= newSize100k && newSize100k <= 9 && 0 <= blockSize100k
|
||||
&& blockSize100k <= 9)) {
|
||||
// throw new IOException("Invalid block size");
|
||||
}
|
||||
|
||||
blockSize100k = newSize100k;
|
||||
|
||||
if (newSize100k == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int n = baseBlockSize * newSize100k;
|
||||
ll8 = new char[n];
|
||||
tt = new int[n];
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,167 +0,0 @@
|
||||
/*
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution, if
|
||||
* any, must include the following acknowlegement:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowlegement may appear in the software itself,
|
||||
* if and wherever such third-party acknowlegements normally appear.
|
||||
*
|
||||
* 4. The names "Ant" and "Apache Software
|
||||
* Foundation" must not be used to endorse or promote products derived
|
||||
* from this software without prior written permission. For written
|
||||
* permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache"
|
||||
* nor may "Apache" appear in their names without prior written
|
||||
* permission of the Apache Group.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This package is based on the work done by Keiron Liddle, Aftex Software
|
||||
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
|
||||
* great code.
|
||||
*/
|
||||
|
||||
package org.apache.tools.bzip2;
|
||||
|
||||
/**
 * A simple class to hold and calculate the CRC for sanity checking
 * of the data.
 *
 * <p>The checksum is computed most significant bit first with the lookup
 * table below, starting from {@code 0xffffffff}; the final value is the
 * bitwise complement of the running value.</p>
 *
 * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
 */
class CRC {
    /** Lookup table with one entry per possible value of the leading byte. */
    public static int crc32Table[] = {
        0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
        0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
        0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
        0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
        0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
        0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
        0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
        0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
        0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
        0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
        0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
        0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
        0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
        0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
        0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
        0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
        0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
        0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
        0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
        0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
        0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
        0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
        0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
        0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
        0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
        0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
        0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
        0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
        0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
        0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
        0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
        0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
        0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
        0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
        0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
        0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
        0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
        0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
        0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
        0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
        0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
        0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
        0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
        0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
        0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
        0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
        0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
        0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
        0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
        0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
        0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
        0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
        0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
        0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
        0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
        0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
        0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
        0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
        0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
        0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
        0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
        0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
        0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
        0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
    };

    /** Running checksum value, before the final complement. */
    int globalCrc;

    /** Creates a checksum in its initial state. */
    public CRC() {
        initialiseCRC();
    }

    /** Resets the running value to its start value (all bits set). */
    void initialiseCRC() {
        globalCrc = 0xffffffff;
    }

    /**
     * Returns the finished checksum.
     *
     * @return the bitwise complement of the running value
     */
    int getFinalCRC() {
        return ~globalCrc;
    }

    /**
     * Returns the running value as it is, without the final complement.
     *
     * @return the running value
     */
    int getGlobalCRC() {
        return globalCrc;
    }

    /**
     * Replaces the running value.
     *
     * @param newCrc the value to continue from
     */
    void setGlobalCRC(int newCrc) {
        globalCrc = newCrc;
    }

    /**
     * Folds one byte into the running value.
     *
     * <p>Only the low eight bits of {@code inCh} take part, so a signed
     * byte, an unsigned byte or a wider value all select a valid table
     * entry.</p>
     *
     * @param inCh the byte to add
     */
    void updateCRC(int inCh) {
        // Masking keeps the table index in 0..255 for every possible input.
        int index = ((globalCrc >>> 24) ^ inCh) & 0xff;
        globalCrc = (globalCrc << 8) ^ CRC.crc32Table[index];
    }
}
|
||||
|
Loading…
Reference in New Issue
Block a user