- moved all code that contained MIT IP outside (http://code.google.com/p/simile-vicino/)

- moved bzip2 and tar code from apache ant into their own jar files
- the gridworks source now contains only com.metaweb.* code; everything else is a jar dependency
- started to work on archive importer


git-svn-id: http://google-refine.googlecode.com/svn/trunk@376 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-04-02 23:40:12 +00:00
parent 4eda7ae2c0
commit 72203cd3d5
40 changed files with 153 additions and 4244 deletions

View File

@ -15,11 +15,12 @@
<classpathentry kind="lib" path="lib/arithcode-1.1.jar" sourcepath="lib-src/arithcode-1.1-sources.jar"/> <classpathentry kind="lib" path="lib/arithcode-1.1.jar" sourcepath="lib-src/arithcode-1.1-sources.jar"/>
<classpathentry kind="lib" path="lib/jdatapath-alpha2.jar" sourcepath="lib-src/jdatapath-alpha2-sources.jar"/> <classpathentry kind="lib" path="lib/jdatapath-alpha2.jar" sourcepath="lib-src/jdatapath-alpha2-sources.jar"/>
<classpathentry kind="lib" path="lib/secondstring-20100303.jar" sourcepath="lib-src/secondstring-20100303-sources.jar"/> <classpathentry kind="lib" path="lib/secondstring-20100303.jar" sourcepath="lib-src/secondstring-20100303-sources.jar"/>
<classpathentry kind="lib" path="lib/ant-tools-1.8.0.jar" sourcepath="lib-src/ant-tools-1.8.0-sources.jar"/>
<classpathentry kind="lib" path="lib/vicino-1.1.jar" sourcepath="lib-src/vicino-1.1-sources.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6.jar"/> <classpathentry kind="lib" path="lib/poi-3.6.jar"/>
<classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/> <classpathentry kind="lib" path="lib/poi-ooxml-3.6.jar"/>
<classpathentry kind="lib" path="lib/apache-tools-tar.jar"/>
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
<classpathentry kind="lib" path="lib/jython-2.5.1.jar"/> <classpathentry kind="lib" path="lib/jython-2.5.1.jar"/>
<classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/> <classpathentry kind="lib" path="lib/clojure-1.1.0.jar"/>
<classpathentry kind="lib" path="tests/lib/junit-4.8.1.jar" sourcepath="tests/lib-src/junit-4.8.1-sources.jar"/>
<classpathentry kind="output" path="build/classes"/> <classpathentry kind="output" path="build/classes"/>
</classpath> </classpath>

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2010, Metaweb Technologies, Inc. All rights reserved. * Copyright (c) 2010 Metaweb Technologies, Inc. All rights reserved.
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
@ -36,9 +36,9 @@ See the 'licenses' directory for a list of the licenses for the libraries we dep
ordered here by license: ordered here by license:
licenses/apache2.0.LICENSE.txt licenses/apache2.0.LICENSE.txt
ant (package org.apache.tools.tar)
bzip2 (package org.apache.tools.bzip2)
calendar-parser (package com.metaweb.gridworks.expr.util) calendar-parser (package com.metaweb.gridworks.expr.util)
ant-tools
bzip2
commons-lang commons-lang
commons-codec commons-codec
jdatapath jdatapath
@ -58,7 +58,7 @@ licenses/dom4j.LICENSE.txt (BSD family)
dom4j dom4j
licenses/simile.LICENSE.txt (BSD family) licenses/simile.LICENSE.txt (BSD family)
vicino (package edu.mit.simile.vicino) vicino
licenses/arithcode.LICENSE.txt (BSD family) licenses/arithcode.LICENSE.txt (BSD family)
arithcode arithcode

Binary file not shown.

Binary file not shown.

BIN
lib/ant-tools-1.8.0.jar Normal file

Binary file not shown.

Binary file not shown.

BIN
lib/vicino-1.1.jar Normal file

Binary file not shown.

View File

@ -37,6 +37,7 @@ import com.metaweb.util.threads.ThreadPoolExecutorAdapter;
public class Gridworks { public class Gridworks {
static private String version; static private String version;
static private File tempDir;
private static Logger root = Logger.getRootLogger(); private static Logger root = Logger.getRootLogger();
private static Logger logger = Logger.getLogger("com.metaweb.gridworks"); private static Logger logger = Logger.getLogger("com.metaweb.gridworks");
@ -65,6 +66,10 @@ public class Gridworks {
return version; return version;
} }
/**
 * Resolves a file name against the Gridworks temporary directory.
 * The file itself is not created by this call.
 *
 * @param name file name, relative to the temporary directory
 * @return a {@code File} for {@code name} located under the temporary directory
 */
public static File getTempFile(String name) {
    File resolved = new File(tempDir, name);
    return resolved;
}
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
// tell jetty to use SLF4J for logging instead of its own stuff // tell jetty to use SLF4J for logging instead of its own stuff
@ -84,6 +89,9 @@ public class Gridworks {
jetty_logger.setLevel(Level.WARN); jetty_logger.setLevel(Level.WARN);
version = Configurations.get("gridworks.version","trunk"); version = Configurations.get("gridworks.version","trunk");
tempDir = new File(Configurations.get("gridworks.temp","temp"));
if (!tempDir.exists()) tempDir.mkdirs();
Gridworks gridworks = new Gridworks(); Gridworks gridworks = new Gridworks();

View File

@ -1,6 +1,9 @@
package com.metaweb.gridworks.commands.edit; package com.metaweb.gridworks.commands.edit;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
@ -9,12 +12,24 @@ import java.io.StringReader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Properties; import java.util.Properties;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
import javax.servlet.ServletException; import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpServletResponse;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.apache.tools.tar.TarEntry;
import org.apache.tools.tar.TarInputStream;
import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch; import com.ibm.icu.text.CharsetMatch;
import com.metaweb.gridworks.Gridworks; import com.metaweb.gridworks.Gridworks;
@ -91,7 +106,9 @@ public class CreateProjectCommand extends Command {
while ((part = parser.readNextPart()) != null) { while ((part = parser.readNextPart()) != null) {
if (part.isFile()) { if (part.isFile()) {
internalImportFilePart((FilePart) part, project, options);
FilePart filePart = (FilePart) part;
internalImportFile(project, options, filePart.getFileName(), filePart.getInputStream());
} else if (part.isParam()) { } else if (part.isParam()) {
ParamPart paramPart = (ParamPart) part; ParamPart paramPart = (ParamPart) part;
@ -118,15 +135,120 @@ public class CreateProjectCommand extends Command {
} }
} }
protected void internalImportFilePart( protected void internalImportFile(
FilePart filePart,
Project project, Project project,
Properties options Properties options,
String fileName,
InputStream inputStream
) throws Exception { ) throws Exception {
Importer importer = guessImporter(options, null, filePart.getFileName()); if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tar.bz2")) {
// first, save the file on disk, since we need two passes and we might
internalInvokeImporter(project, importer, options, filePart.getInputStream(), null); // not have enough memory to keep it all in there
File file = save(inputStream);
// in the first pass, gather statistics about what files are in there
// unfortunately, we have to rely on files extensions, which is horrible but
// better than nothing
BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file));
InputStream is = (fileName.endsWith(".tar.gz")) ? new GZIPInputStream(stream): new CBZip2InputStream(stream);
TarInputStream tis = new TarInputStream(is);
HashMap<String,Integer> ext_map = new HashMap<String,Integer>();
while (true) {
TarEntry entry = tis.getNextEntry();
if (entry == null) break;
if (!entry.isDirectory()) {
String name = entry.getName();
String ext = getExtension(name)[1];
if (ext_map.containsKey(ext)) {
ext_map.put(ext, ext_map.get(ext) + 1);
} else {
ext_map.put(ext, 1);
}
}
}
stream.close();
// sort extensions by how often they appear
List<Entry<String,Integer>> values = new ArrayList<Entry<String,Integer>>(ext_map.entrySet());
Collections.sort(values, new ValuesComparator());
if (values.size() == 0) {
throw new RuntimeException("The archive contains no files.");
}
// this will contain the set of extensions we'll load from the archive
HashSet<String> exts = new HashSet<String>();
// find the extension that is most frequent or those who share the highest frequency value
Entry<String,Integer> most_frequent = values.get(0);
Entry<String,Integer> second_most_frequent = values.get(1);
if (most_frequent.getValue() > second_most_frequent.getValue()) { // we have a winner
exts.add(most_frequent.getKey());
} else { // multiple extensions have the same frequency
int winning_frequency = most_frequent.getValue();
for (Entry<String,Integer> e : values) {
if (e.getValue() == winning_frequency) {
exts.add(e.getKey());
}
}
}
Gridworks.log("Most frequent extensions: " + exts.toString());
} else if (fileName.endsWith(".zip")) {
} else if (fileName.endsWith(".gz")) {
String[] frags = getExtension(fileName);
internalImportFile(project, options, frags[0], new GZIPInputStream(inputStream));
} else if (fileName.endsWith(".bz2")) {
String[] frags = getExtension(fileName);
internalImportFile(project, options, frags[0], new CBZip2InputStream(inputStream));
} else {
load(project, options, fileName, inputStream);
}
}
/**
 * Orders map entries by their integer value, largest value first.
 */
public class ValuesComparator implements Comparator<Entry<String,Integer>> {
    /**
     * @return a negative number when {@code o1} has the larger value, a positive number when
     *         {@code o2} has the larger value, zero when both values are equal
     */
    public int compare(Entry<String,Integer> o1, Entry<String,Integer> o2) {
        // compareTo rather than subtraction: a difference of two ints can overflow and flip sign
        return o2.getValue().compareTo(o1.getValue());
    }
}
/**
 * Picks an importer from the file name and runs it over the stream, then closes the stream.
 *
 * @param project     project that receives the imported data
 * @param options     import options passed to the importer lookup and to the importer
 * @param fileName    file name used to guess the importer
 * @param inputStream data to import; always closed before this method returns
 * @throws Exception if the importer lookup or the import itself fails
 */
private void load(Project project, Properties options, String fileName, InputStream inputStream) throws Exception {
    try {
        Importer importer = guessImporter(options, null, fileName);
        internalInvokeImporter(project, importer, options, inputStream, null);
    } finally {
        // close on failure too, otherwise a failed import leaks the upload stream
        inputStream.close();
    }
}
/**
 * Writes the whole stream to a file in the Gridworks temporary directory.
 * The file is named after the current time in milliseconds and is registered for deletion
 * when the JVM exits.
 *
 * @param is data to store; always closed before this method returns
 * @return the temporary file holding the data
 * @throws IOException if the data cannot be read or the file cannot be written
 */
private File save(InputStream is) throws IOException {
    // NOTE(review): two uploads arriving in the same millisecond would share a file name
    File temp = Gridworks.getTempFile(Long.toString(System.currentTimeMillis()));
    temp.deleteOnExit();
    try {
        copy(is, temp);
    } finally {
        // make sure the upload stream is released even when the copy fails half-way
        is.close();
    }
    return temp;
}
/**
 * Splits a file name at its last dot.
 * A dot that belongs to a directory component (one followed by {@code /} or {@code \}) is not
 * treated as an extension separator, so {@code "data.v1/readme"} has no extension.
 *
 * @param filename file name, optionally with a path in front of it
 * @return a two-element array: {@code [0]} is the name without its extension,
 *         {@code [1]} is the extension without the dot (empty when there is none)
 */
private String[] getExtension(String filename) {
    int separator_index = Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
    int ext_index = filename.lastIndexOf('.');
    // only a dot inside the last path component marks an extension
    boolean has_extension = ext_index > separator_index;

    String[] result = new String[2];
    result[0] = has_extension ? filename.substring(0, ext_index) : filename;
    result[1] = has_extension ? filename.substring(ext_index + 1) : "";
    return result;
}
private static long copy(InputStream input, File file) throws IOException {
FileOutputStream output = new FileOutputStream(file);
byte[] buffer = new byte[4 * 1024];
long count = 0;
int n = 0;
while (-1 != (n = input.read(buffer))) {
output.write(buffer, 0, n);
count += n;
}
output.close();
input.close();
return count;
} }
protected void internalImportURL( protected void internalImportURL(
@ -237,17 +359,9 @@ public class CreateProjectCommand extends Command {
new InputStreamReader(inputStream); new InputStreamReader(inputStream);
} }
try { importer.read(reader, project, options, skip, limit);
importer.read(reader, project, options, skip, limit);
} finally {
reader.close();
}
} else { } else {
try { importer.read(inputStream, project, options, skip, limit);
importer.read(inputStream, project, options, skip, limit);
} finally {
inputStream.close();
}
} }
} }
@ -295,6 +409,12 @@ public class CreateProjectCommand extends Command {
return new ExcelImporter(false); return new ExcelImporter(false);
} else if("application/x-xls".equals(contentType)) { } else if("application/x-xls".equals(contentType)) {
return new ExcelImporter(true); return new ExcelImporter(true);
} else if("application/xml".equals(contentType) ||
"text/xml".equals(contentType) ||
"application/rss+xml".equals(contentType) ||
"application/atom+xml".equals(contentType) ||
"application/rdf+xml".equals(contentType)) {
return new XmlImporter();
} }
} else if (fileName != null) { } else if (fileName != null) {
fileName = fileName.toLowerCase(); fileName = fileName.toLowerCase();

View File

@ -1,79 +0,0 @@
package edu.mit.simile.vicino;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import edu.mit.simile.vicino.clustering.Clusterer;
import edu.mit.simile.vicino.clustering.NGramClusterer;
import edu.mit.simile.vicino.clustering.VPTreeClusterer;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Command-line tool that clusters the same strings with the VP-tree clusterer and with the
 * n-gram clusterer, then reports for each one the number of clusters, the elapsed time and
 * the number of distance computations.
 * Arguments: distance name, input file or directory, radius, n-gram blocking size.
 */
public class Cluster extends Operator {

    public static void main(String[] args) throws Exception {
        Cluster tool = new Cluster();
        tool.init(args);
    }

    /**
     * Runs both clusterers, logs their statistics and terminates the JVM.
     *
     * @param args distance name, input path, radius, blocking size (in that order)
     */
    public void init(String[] args) throws Exception {
        Distance metric = getDistance(args[0]);
        List<String> inputs = getStrings(args[1]);
        double radius = Double.parseDouble(args[2]);
        int blockingSize = Integer.parseInt(args[3]);

        // VP-tree run
        long vpStarted = System.currentTimeMillis();
        Clusterer vpClusterer = new VPTreeClusterer(metric);
        for (String input : inputs) {
            vpClusterer.populate(input);
        }
        List<Set<Serializable>> vpClusters = vpClusterer.getClusters(radius);
        long vpElapsed = System.currentTimeMillis() - vpStarted;
        int vpDistances = metric.getCount();
        // the counter is shared by both runs, so start the second one from zero
        metric.resetCounter();

        // n-gram run
        long ngStarted = System.currentTimeMillis();
        Clusterer ngClusterer = new NGramClusterer(metric, blockingSize);
        for (String input : inputs) {
            ngClusterer.populate(input);
        }
        List<Set<Serializable>> ngClusters = ngClusterer.getClusters(radius);
        long ngElapsed = System.currentTimeMillis() - ngStarted;
        int ngDistances = metric.getCount();
        metric.resetCounter();

        log("VPTree found " + vpClusters.size() + " in " + vpElapsed + " ms with " + vpDistances + " distances\n");
        log("NGram found " + ngClusters.size() + " in " + ngElapsed + " ms with " + ngDistances + " distances\n");

        // only the method that found more clusters gets its extra clusters listed
        int vpCount = vpClusters.size();
        int ngCount = ngClusters.size();
        if (vpCount > ngCount) {
            log("VPTree clusterer found these clusters the other method couldn't: ");
            diff(vpClusters, ngClusters);
        } else if (ngCount > vpCount) {
            log("NGram clusterer found these clusters the other method couldn't: ");
            diff(ngClusters, vpClusters);
        }

        System.exit(0);
    }

    /** Prints every cluster of {@code more} that does not appear in {@code base}. */
    private void diff(List<Set<Serializable>> more, List<Set<Serializable>> base) {
        Set<Set<Serializable>> known = new HashSet<Set<Serializable>>(base);
        for (Set<Serializable> candidate : more) {
            if (known.contains(candidate)) {
                continue;
            }
            printCluster(candidate);
        }
    }

    /** Logs each member of the cluster on its own line, followed by an empty line. */
    private void printCluster(Set<Serializable> cluster) {
        for (Serializable member : cluster) {
            String text = member.toString();
            log(text);
        }
        log("");
    }
}

View File

@ -1,61 +0,0 @@
package edu.mit.simile.vicino;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Command-line tool that samples random pairs of strings, measures their distance and prints
 * a text histogram of the distances together with the average time per computation.
 * Arguments: distance name, input file or directory, number of histogram buckets.
 */
public class Distributor extends Operator {

    private static final int COLUMNS = 70;
    private static final char CHAR = '=';

    public static void main(String[] args) throws Exception {
        Distance d = getDistance(args[0]);
        List<String> strings = getStrings(args[1]);
        int buckets = Integer.parseInt(args[2]);
        if (buckets <= 0) {
            throw new IllegalArgumentException("The number of buckets must be positive: " + buckets);
        }

        long start = System.currentTimeMillis();
        int[] values = new int[buckets];
        int size = strings.size();
        for (int i = 0; i < size; i++) {
            String x = strings.get((int) (Math.random() * size));
            String y = strings.get((int) (Math.random() * size));
            int dist = (int) (buckets * d.d(x, y));
            // a distance of 1.0 or more (or a negative one) would index outside the histogram,
            // so such samples are counted in the nearest edge bucket
            if (dist < 0) {
                dist = 0;
            } else if (dist >= buckets) {
                dist = buckets - 1;
            }
            values[dist]++;
            System.out.print(".");
        }
        System.out.println();
        long stop = System.currentTimeMillis();
        float m = ((float) (stop - start)) / (float) size;

        int maxValue = 0;
        for (int i = 0; i < buckets; i++) {
            if (values[i] > maxValue) {
                maxValue = values[i];
            }
        }

        System.out
                .println("+-------------------------------------------------------------------");
        for (int i = 0; i < buckets; i++) {
            // with no samples every bucket is empty; avoid dividing by zero
            int width = (maxValue == 0) ? 0 : COLUMNS * values[i] / maxValue;
            System.out.println("|" + bar(width));
        }
        System.out
                .println("+-------------------------------------------------------------------");
        System.out.println("\n Each distance calculation took: " + m + " millis");
    }

    /** Builds a bar made of {@code value} repetitions of the bar character. */
    static private String bar(int value) {
        StringBuilder b = new StringBuilder(value);
        for (int i = 0; i < value; i++) {
            b.append(CHAR);
        }
        return b.toString();
    }
}

View File

@ -1,12 +0,0 @@
package edu.mit.simile.vicino;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Command-line tool that prints the distance between two strings.
 * Arguments: distance name, first string, second string.
 */
public class Meter extends Operator {

    public static void main(String[] args) throws Exception {
        Distance metric = getDistance(args[0]);
        String left = args[1];
        String right = args[2];
        double measured = metric.d(left, right);
        System.out.println(left + " <- " + measured + " -> " + right);
    }
}

View File

@ -1,94 +0,0 @@
package edu.mit.simile.vicino;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
/**
 * Tokenizer that splits a normalized string into all of its character n-grams of a fixed size.
 * Tokens are interned per tokenizer instance: the same n-gram always yields the same token.
 */
public class NGramTokenizer implements Tokenizer {

    private int ngram_size;

    /**
     * @param ngram_size number of characters in each n-gram
     */
    public NGramTokenizer(int ngram_size) {
        this.ngram_size = ngram_size;
    }

    /**
     * Normalizes the string and returns one token per n-gram, in order of appearance.
     * Strings shorter than the n-gram size yield no tokens.
     */
    public Token[] tokenize(String str) {
        String normalized = normalize(str);
        int length = normalized.length();
        List<Token> grams = new ArrayList<Token>();
        for (int begin = 0; begin < length; begin++) {
            int end = begin + ngram_size;
            if (end > length) {
                continue;
            }
            grams.add(intern(normalized.substring(begin, end)));
        }
        BasicToken[] holder = new BasicToken[grams.size()];
        return (Token[]) grams.toArray(holder);
    }

    static final Pattern extra = Pattern.compile("\\p{Cntrl}|\\p{Punct}");
    static final Pattern whitespace = Pattern.compile("\\p{Space}+");

    /**
     * Trims, removes control and punctuation characters, collapses runs of whitespace into a
     * single space and lower-cases the result.
     */
    private String normalize(String s) {
        String stripped = extra.matcher(s.trim()).replaceAll("");
        String collapsed = whitespace.matcher(stripped).replaceAll(" ");
        return collapsed.toLowerCase().intern();
    }

    private int nextId = 0;
    private Map<String, Token> tokMap = new TreeMap<String, Token>();

    /**
     * Returns the token registered for the lower-cased string, creating it (with the next
     * free index) on first use.
     */
    public Token intern(String s) {
        String key = s.toLowerCase().intern();
        Token existing = tokMap.get(key);
        if (existing != null) {
            return existing;
        }
        Token created = new BasicToken(++nextId, key);
        tokMap.put(key, created);
        return created;
    }

    /** Iterates over every token created so far, ordered by token text. */
    public Iterator<Token> tokenIterator() {
        return tokMap.values().iterator();
    }

    /** Returns the highest token index handed out so far (0 when no token exists). */
    public int maxTokenIndex() {
        return nextId;
    }

    /** Token made of an index and its text; compared by index, hashed by text. */
    public class BasicToken implements Token, Comparable<Token> {
        private final int index;
        private final String value;

        BasicToken(int index, String value) {
            this.index = index;
            this.value = value;
        }

        public String getValue() {
            return value;
        }

        public int getIndex() {
            return index;
        }

        public int compareTo(Token t) {
            int other = t.getIndex();
            return index - other;
        }

        public int hashCode() {
            return value.hashCode();
        }

        public String toString() {
            StringBuilder text = new StringBuilder("[token#");
            text.append(getIndex()).append(":").append(getValue()).append("]");
            return text.toString();
        }
    }
}

View File

@ -1,47 +0,0 @@
package edu.mit.simile.vicino;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Shared helpers for the vicino command-line tools: logging, distance lookup by name and
 * loading of input strings.
 */
public class Operator {

    /** Prints the message on standard output. */
    static void log(String msg) {
        System.out.println(msg);
    }

    /**
     * Instantiates the class {@code edu.mit.simile.vicino.distances.<distance>Distance}.
     *
     * @param distance short name of the distance, e.g. the part before the "Distance" suffix
     * @throws Exception if the class cannot be found or instantiated
     */
    static Distance getDistance(String distance) throws Exception {
        return (Distance) Class.forName("edu.mit.simile.vicino.distances." + distance + "Distance").newInstance();
    }

    /**
     * Reads the lines of a file, or of every entry directly inside a directory.
     *
     * @param fileName path of a file or of a directory
     * @return the trimmed lines, in reading order
     * @throws IOException if a file cannot be read or the directory cannot be listed
     */
    static List<String> getStrings(String fileName) throws IOException {
        List<String> strings = new ArrayList<String>();

        File file = new File(fileName);
        if (file.isDirectory()) {
            File[] files = file.listFiles();
            if (files == null) {
                // listFiles() returns null on I/O errors; report that instead of failing with an NPE
                throw new IOException("Unable to list directory: " + file);
            }
            for (File f : files) {
                getStrings(f, strings);
            }
        } else {
            getStrings(file, strings);
        }

        return strings;
    }

    /**
     * Appends every line of the file (read as UTF-8, trimmed and interned) to the list.
     *
     * @param file    file to read
     * @param strings list that receives the lines
     * @throws IOException if the file cannot be opened or read
     */
    static void getStrings(File file, List<String> strings) throws IOException {
        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = input.readLine()) != null) {
                strings.add(line.trim().intern());
            }
        } finally {
            // release the file handle even when reading fails
            input.close();
        }
    }
}

View File

@ -1,53 +0,0 @@
package edu.mit.simile.vicino;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.vptree.VPTree;
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
/**
 * Interactive command-line tool: builds a VP-tree over the input strings, then answers range
 * queries typed on standard input as {@code string|range}.
 * Arguments: distance name, input file or directory.
 */
public class Seeker extends Operator {

    public static void main(String[] args) throws Exception {
        Distance d = getDistance(args[0]);

        log("Working with distance: " + d);
        List<String> strings = getStrings(args[1]);
        log("Obtained " + strings.size() + " from " + args[1]);

        log("Building VPTree...");
        VPTreeBuilder builder = new VPTreeBuilder(d);
        VPTree tree = builder.buildVPTree(strings);
        log("..done");

        VPTreeSeeker seeker = new VPTreeSeeker(d, tree);

        log("type a string|range then hit return:");
        BufferedReader input = new BufferedReader(new InputStreamReader(System.in));
        String line = null;
        while ((line = input.readLine()) != null) {
            int index = line.indexOf('|');
            if (index < 0) {
                // a malformed line must not kill the session (and the tree that was just built)
                log(" [expected input of the form string|range]");
                continue;
            }
            String query = line.substring(0, index);
            String rangeText = line.substring(index + 1);
            float range;
            try {
                range = Float.parseFloat(rangeText);
            } catch (NumberFormatException e) {
                log(" [invalid range: " + rangeText + "]");
                continue;
            }

            long start = System.currentTimeMillis();
            Set<Serializable> results = seeker.range(query, range);
            long stop = System.currentTimeMillis();

            Iterator<Serializable> j = results.iterator();
            if (j.hasNext()) {
                while (j.hasNext()) {
                    String r = (String) j.next();
                    log(" " + r);
                }
                log(" [done in " + (stop - start) + "ms]");
            } else {
                log(" [no results found in " + (stop - start) + "ms]");
            }
        }
    }
}

View File

@ -1,46 +0,0 @@
package edu.mit.simile.vicino;
import java.util.List;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Command-line tool that checks, on random triples of input strings, whether a distance
 * behaves like a metric (identity, symmetry, triangle inequality).
 * Arguments: distance name, input file or directory.
 */
public class Tester extends Operator {

    public static void main(String[] args) throws Exception {
        Distance d = getDistance(args[0]);
        List<String> strings = getStrings(args[1]);

        long start = System.currentTimeMillis();
        int size = strings.size();
        // size^3 does not fit in an int for more than ~1290 strings
        long iterations = (long) size * size * size;
        for (long i = 0; i < iterations; i++) {
            String x = strings.get((int) (Math.random() * size));
            String y = strings.get((int) (Math.random() * size));
            String z = strings.get((int) (Math.random() * size));
            boolean metric = metric(x, y, z, d);
            if (metric) {
                System.out.println("metric");
            } else {
                System.out.println("***** NOT METRIC *****");
            }
        }
        long stop = System.currentTimeMillis();
        // average over the number of evaluations actually performed
        float m = ((float) (stop - start)) / (float) iterations;

        System.out.println("\n Each metric evaluation took: " + m + " millis");
    }

    /**
     * Checks the metric axioms on one triple.
     *
     * @return true when d(x,x) is zero, d(x,y) equals d(y,x) and d(x,z) <= d(x,y) + d(y,z)
     */
    static boolean metric(String x, String y, String z, Distance d) {
        double dxx = d.d(x, x);
        boolean identity = (dxx == 0.0f);
        double dxy = d.d(x, y);
        double dyx = d.d(y, x);
        boolean simmetrical = (dxy == dyx);
        double dxz = d.d(x, z);
        double dyz = d.d(y, z);
        boolean triangular = (dxz <= dxy + dyz);
        return (identity && simmetrical && triangular);
    }
}

View File

@ -1,20 +0,0 @@
package edu.mit.simile.vicino.clustering;
import java.io.Serializable;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
/**
 * Base class of the clustering strategies: strings are fed one at a time with
 * {@link #populate(String)} and clusters are then requested for a given radius.
 */
public abstract class Clusterer {

    /** Orders clusters by size, largest first. */
    public class SizeComparator implements Comparator<Set<Serializable>> {
        public int compare(Set<Serializable> o1, Set<Serializable> o2) {
            int firstSize = o1.size();
            int secondSize = o2.size();
            return secondSize - firstSize;
        }
    }

    /** Adds one string to the data to be clustered. */
    public abstract void populate(String s);

    /**
     * Computes the clusters of the strings added so far.
     *
     * @param radius distance threshold used to group strings
     */
    public abstract List<Set<Serializable>> getClusters(double radius);
}

View File

@ -1,194 +0,0 @@
package edu.mit.simile.vicino.clustering;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import com.wcohen.ss.api.Token;
import edu.mit.simile.vicino.NGramTokenizer;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Clusterer that first groups strings into blocks sharing an n-gram, then compares only the
 * strings inside the same block. A string {@code b} joins the cluster of {@code a} when their
 * distance is within the radius (a negative radius accepts every pair of a block).
 */
public class NGramClusterer extends Clusterer {

    NGramTokenizer _tokenizer;
    Distance _distance;

    /** n-gram text -> strings that contain that n-gram */
    Map<String,Set<String>> blocks = new HashMap<String,Set<String>>();

    /**
     * @param d         distance used to compare the strings of a block
     * @param blockSize size of the n-grams used as blocking keys
     */
    public NGramClusterer(Distance d, int blockSize) {
        _tokenizer = new NGramTokenizer(blockSize);
        _distance = d;
    }

    /** Registers the string in the block of each of its n-grams. */
    public void populate(String s) {
        Token[] tokens = _tokenizer.tokenize(s);
        for (Token t : tokens) {
            String ss = t.getValue();
            Set<String> l = blocks.get(ss);
            if (l == null) {
                l = new TreeSet<String>();
                blocks.put(ss, l);
            }
            l.add(s);
        }
    }

    /** Task that clusters the blocks with index in {@code [start, stop)} of a block list. */
    public class BlockEvaluator implements Callable<Map<Serializable,Set<Serializable>>> {

        int start;
        int stop;
        double radius;

        List<Set<String>> blocks;
        // never assigned; kept only so that the declared members of this class stay the same
        Map<Serializable,Set<Serializable>> cluster_map;

        public BlockEvaluator(List<Set<String>> blocks, double radius, int start, int stop) {
            this.blocks = blocks;
            this.start = start;
            this.stop = stop;
            this.radius = radius;
        }

        /** @return map from a cluster's first string to the members of that cluster */
        public Map<Serializable,Set<Serializable>> call() {
            Map<Serializable,Set<Serializable>> result = new HashMap<Serializable,Set<Serializable>>();
            for (int i = start; i < stop; i++) {
                clusterBlock(blocks.get(i), result, radius);
            }
            return result;
        }
    }

    private static final ExecutorService executor = Executors.newCachedThreadPool();

    private static final boolean MULTITHREADED = true;

    public List<Set<Serializable>> getClusters(double radius) {
        if (MULTITHREADED) {
            return getClustersMultiThread(radius);
        } else {
            return getClustersSingleThread(radius);
        }
    }

    /**
     * Clusters the blocks on one task per available processor and merges the results.
     * NOTE(review): the tasks call the shared distance concurrently; a distance that counts
     * its invocations with a plain int field may under-count -- confirm whether that matters.
     *
     * @return the distinct clusters with more than one member, largest first
     */
    public List<Set<Serializable>> getClustersMultiThread(double radius) {

        int cores = Runtime.getRuntime().availableProcessors();
        int size = blocks.size();
        int range = size / cores + 1;

        // one snapshot shared by every task: they only read it, and sharing guarantees that
        // all index ranges refer to the same ordering of the blocks
        List<Set<String>> block_list = new ArrayList<Set<String>>(blocks.values());

        List<Map<Serializable,Set<Serializable>>> cluster_maps = new ArrayList<Map<Serializable,Set<Serializable>>>(cores);

        List<BlockEvaluator> evaluators = new ArrayList<BlockEvaluator>(cores);
        for (int i = 0; i < cores; i++) {
            int range_start = range * i;
            int range_end = range * (i + 1);
            if (range_end > size) range_end = size;
            evaluators.add(new BlockEvaluator(block_list, radius, range_start, range_end));
        }

        try {
            List<Future<Map<Serializable,Set<Serializable>>>> futures = executor.invokeAll(evaluators);
            for (Future<Map<Serializable,Set<Serializable>>> future : futures) {
                cluster_maps.add(future.get());
            }
        } catch (InterruptedException e1) {
            e1.printStackTrace();
            // keep the interruption visible to the caller
            Thread.currentThread().interrupt();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }

        // a set removes the clusters that several tasks found identically
        Set<Set<Serializable>> clusters = new HashSet<Set<Serializable>>();

        for (Map<Serializable,Set<Serializable>> cluster_map : cluster_maps) {
            for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
                Set<Serializable> v = e.getValue();
                if (v.size() > 1) {
                    clusters.add(v);
                }
            }
        }

        List<Set<Serializable>> sorted_clusters = new ArrayList<Set<Serializable>>(clusters);

        Collections.sort(sorted_clusters, new SizeComparator());

        return sorted_clusters;
    }

    /**
     * Clusters every block on the calling thread.
     *
     * @return the clusters with more than one member, largest first
     */
    public List<Set<Serializable>> getClustersSingleThread(double radius) {

        Map<Serializable,Set<Serializable>> cluster_map = new HashMap<Serializable,Set<Serializable>>();

        for (Set<String> set : blocks.values()) {
            clusterBlock(set, cluster_map, radius);
        }

        List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>();
        for (Entry<Serializable,Set<Serializable>> e : cluster_map.entrySet()) {
            Set<Serializable> v = e.getValue();
            if (v.size() > 1) {
                clusters.add(v);
            }
        }

        Collections.sort(clusters, new SizeComparator());

        return clusters;
    }

    /** Compares every pair of strings of one block and records the close ones in the map. */
    private void clusterBlock(Set<String> set, Map<Serializable,Set<Serializable>> cluster_map, double radius) {
        if (set.size() < 2) return;
        for (String a : set) {
            for (String b : set) {
                if (a.equals(b)) continue;
                // skip pairs already linked, in either direction
                Set<Serializable> of_a = cluster_map.get(a);
                if (of_a != null && of_a.contains(b)) continue;
                Set<Serializable> of_b = cluster_map.get(b);
                if (of_b != null && of_b.contains(a)) continue;

                double d = _distance.d(a, b);
                if (d <= radius || radius < 0) {
                    Set<Serializable> l = cluster_map.get(a);
                    if (l == null) {
                        l = new TreeSet<Serializable>();
                        l.add(a);
                        cluster_map.put(a, l);
                    }
                    l.add(b);
                }
            }
        }
    }
}

View File

@ -1,63 +0,0 @@
package edu.mit.simile.vicino.clustering;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.mit.simile.vicino.distances.Distance;
import edu.mit.simile.vicino.vptree.Node;
import edu.mit.simile.vicino.vptree.VPTree;
import edu.mit.simile.vicino.vptree.VPTreeBuilder;
import edu.mit.simile.vicino.vptree.VPTreeSeeker;
/**
 * Clusterer backed by a VP-tree: every string that has not yet appeared in a previous result
 * is used as the center of a range query, and results with more than one element are kept as
 * clusters.
 */
public class VPTreeClusterer extends Clusterer {

    VPTreeBuilder _treeBuilder;
    Distance _distance;

    /**
     * @param d distance used both to build the tree and to query it
     */
    public VPTreeClusterer(Distance d) {
        _distance = d;
        _treeBuilder = new VPTreeBuilder(d);
    }

    /** Hands the string to the tree builder. */
    public void populate(String s) {
        _treeBuilder.populate(s);
    }

    /**
     * Builds the tree and runs the range queries.
     *
     * @param radius range used for each query
     * @return the clusters found, largest first
     */
    public List<Set<Serializable>> getClusters(double radius) {
        VPTree tree = _treeBuilder.buildVPTree();
        System.out.println("distances after the tree: " + _distance.getCount());
        Set<Node> nodes = _treeBuilder.getNodes();

        VPTreeSeeker seeker = new VPTreeSeeker(_distance, tree);

        // true while a value has not been returned by any query yet
        Map<Serializable,Boolean> pending = new HashMap<Serializable,Boolean>();
        for (Node node : nodes) {
            pending.put(node.get(), true);
        }

        Map<Serializable,Set<Serializable>> found = new HashMap<Serializable,Set<Serializable>>();
        for (Node node : nodes) {
            Serializable center = node.get();
            boolean stillPending = pending.get(center);
            if (!stillPending) {
                continue;
            }
            Set<Serializable> neighbours = seeker.range(center, radius);
            for (Serializable neighbour : neighbours) {
                pending.put(neighbour, false);
            }
            if (neighbours.size() > 1) {
                found.put(center, neighbours);
            }
        }

        List<Set<Serializable>> clusters = new ArrayList<Set<Serializable>>(found.values());
        Collections.sort(clusters, new SizeComparator());

        return clusters;
    }
}

View File

@ -1,26 +0,0 @@
package edu.mit.simile.vicino.distances;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.tools.bzip2.CBZip2OutputStream;
/**
 * Raw measure based on bzip2 compression: the size, in bytes, of the compressed
 * concatenation of the two strings.
 */
public class BZip2Distance extends PseudoMetricDistance {

    /**
     * @return the number of bytes produced by bzip2-compressing {@code x + y},
     *         or 0 if compression fails
     */
    public double d2(String x, String y) {
        String str = x + y;
        double result = 0.0f;
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
            CBZip2OutputStream os = new CBZip2OutputStream(baos);
            // fixed charset: the platform default would make the measure machine-dependent
            os.write(str.getBytes("UTF-8"));
            os.close();
            baos.close();
            result = baos.size();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
}

View File

@ -1,17 +0,0 @@
package edu.mit.simile.vicino.distances;
/**
 * Base class of string distances; carries a counter that implementations can use to record
 * how many distances were computed.
 */
public abstract class Distance {

    // number of computations recorded since creation or since the last reset
    int counter;

    /** Returns the current value of the counter. */
    public int getCount() {
        return this.counter;
    }

    /** Sets the counter back to zero. */
    public void resetCounter() {
        this.counter = 0;
    }

    /** Computes the distance between the two strings. */
    public abstract double d(String x, String y);
}

View File

@ -1,26 +0,0 @@
package edu.mit.simile.vicino.distances;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
/**
 * Raw measure based on gzip compression: the size, in bytes, of the compressed
 * concatenation of the two strings.
 */
public class GZipDistance extends PseudoMetricDistance {

    /**
     * @return the number of bytes produced by gzip-compressing {@code x + y},
     *         or 0 if compression fails
     */
    public double d2(String x, String y) {
        String str = x + y;
        double result = 0.0f;
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
            GZIPOutputStream os = new GZIPOutputStream(baos);
            // fixed charset: the platform default would make the measure machine-dependent
            os.write(str.getBytes("UTF-8"));
            os.close();
            baos.close();
            result = baos.size();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
}

View File

@ -1,18 +0,0 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.Jaccard;
import com.wcohen.ss.api.StringDistance;
/**
 * Distance that delegates to the SecondString {@code Jaccard} scorer.
 */
public class JaccardDistance extends MetricDistance {

    StringDistance distance;

    public JaccardDistance() {
        distance = new Jaccard();
    }

    /** Returns the score the delegate assigns to the pair. */
    protected double d2(String x, String y) {
        double score = distance.score(x, y);
        return score;
    }
}

View File

@ -1,18 +0,0 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.Jaro;
import com.wcohen.ss.api.StringDistance;
/**
 * {@link MetricDistance} whose raw measure is delegated to a
 * {@link Jaro} scorer.
 */
public class JaroDistance extends MetricDistance {

    /** Scorer that performs the actual string comparison. */
    StringDistance distance;

    /** Creates the distance with a fresh {@link Jaro} scorer. */
    public JaroDistance() {
        distance = new Jaro();
    }

    /**
     * @param x first string
     * @param y second string
     * @return the score the underlying scorer assigns to the pair
     */
    protected double d2(String x, String y) {
        final double score = distance.score(x, y);
        return score;
    }
}

View File

@ -1,18 +0,0 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinkler;
import com.wcohen.ss.api.StringDistance;
/**
 * {@link MetricDistance} whose raw measure is delegated to a
 * {@link JaroWinkler} scorer.
 */
public class JaroWinklerDistance extends MetricDistance {

    /** Scorer that performs the actual string comparison. */
    StringDistance distance;

    /** Creates the distance with a fresh {@link JaroWinkler} scorer. */
    public JaroWinklerDistance() {
        distance = new JaroWinkler();
    }

    /**
     * @param x first string
     * @param y second string
     * @return the score the underlying scorer assigns to the pair
     */
    protected double d2(String x, String y) {
        final double score = distance.score(x, y);
        return score;
    }
}

View File

@ -1,18 +0,0 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.JaroWinklerTFIDF;
import com.wcohen.ss.api.StringDistance;
/**
 * {@link MetricDistance} whose raw measure is delegated to a
 * {@link JaroWinklerTFIDF} scorer.
 */
public class JaroWinklerTFIDFDistance extends MetricDistance {

    /** Scorer that performs the actual string comparison. */
    StringDistance distance;

    /** Creates the distance with a fresh {@link JaroWinklerTFIDF} scorer. */
    public JaroWinklerTFIDFDistance() {
        distance = new JaroWinklerTFIDF();
    }

    /**
     * @param x first string
     * @param y second string
     * @return the score the underlying scorer assigns to the pair
     */
    protected double d2(String x, String y) {
        final double score = distance.score(x, y);
        return score;
    }
}

View File

@ -1,18 +0,0 @@
package edu.mit.simile.vicino.distances;
import com.wcohen.ss.Levenstein;
import com.wcohen.ss.api.StringDistance;
/**
 * {@link MetricDistance} whose raw measure is the absolute value of the
 * score produced by a {@link Levenstein} scorer.
 */
public class LevenshteinDistance extends MetricDistance {

    /** Scorer that performs the actual string comparison. */
    StringDistance distance;

    /** Creates the distance with a fresh {@link Levenstein} scorer. */
    public LevenshteinDistance() {
        distance = new Levenstein();
    }

    /**
     * @param x first string
     * @param y second string
     * @return the magnitude of the scorer's result for the pair
     */
    public double d2(String x, String y) {
        final double score = distance.score(x, y);
        return Math.abs(score);
    }
}

View File

@ -1,24 +0,0 @@
package edu.mit.simile.vicino.distances;
/**
 * A {@link Distance} whose value is exactly the raw measure returned by
 * {@link #d2(String, String)}; each call is recorded as one evaluation.
 */
public abstract class MetricDistance extends Distance {

    /**
     * Evaluates the raw measure once and returns it unchanged.
     *
     * @param x first string
     * @param y second string
     * @return the value of {@code d2(x, y)}
     */
    public double d(String x, String y) {
        final double measured = d2(x, y);
        // Count only after the measure has been computed successfully.
        counter++;
        return measured;
    }

    /**
     * Raw measure supplied by the concrete distance.
     *
     * @param x first string
     * @param y second string
     * @return the raw measure for the pair
     */
    abstract double d2(String x, String y);
}

View File

@ -1,27 +0,0 @@
package edu.mit.simile.vicino.distances;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import com.colloquial.arithcode.ArithCodeOutputStream;
import com.colloquial.arithcode.PPMModel;
/**
 * Compression-based pseudo-distance: the raw measure of a pair is the number
 * of bytes their concatenation occupies after arithmetic coding with a
 * {@link PPMModel}.
 */
public class PPMDistance extends PseudoMetricDistance {

    /**
     * Returns the PPM-compressed size, in bytes, of {@code x + y}.
     *
     * @param x first string
     * @param y second string, appended to {@code x} before compressing
     * @return the compressed length, or {@code 0} if compression failed
     */
    public double d2(String x, String y) {
        String str = x + y;
        double result = 0.0d;
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream(str.length());
            ArithCodeOutputStream os = new ArithCodeOutputStream(baos, new PPMModel(8));
            // Encode with a fixed charset: the platform default would make the
            // measure machine-dependent and may map unencodable characters to
            // '?', making different strings look identical.
            os.write(str.getBytes("UTF-8"));
            os.close();
            // size() reports the byte count without copying the buffer.
            result = baos.size();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
}

View File

@ -1,16 +0,0 @@
package edu.mit.simile.vicino.distances;
/**
 * A {@link Distance} derived from four raw measurements of a pair: each
 * string with itself and the two strings in both orders.
 */
public abstract class PseudoMetricDistance extends Distance {

    /**
     * Combines the four raw measures as
     * {@code 10 * ((d2(x,y) + d2(y,x)) / (d2(x,x) + d2(y,y)) - 1)} and records
     * four evaluations.
     *
     * @param x first string
     * @param y second string
     * @return the combined value
     */
    public double d(String x, String y) {
        final double selfX = d2(x, x);
        final double selfY = d2(y, y);
        final double forward = d2(x, y);
        final double backward = d2(y, x);
        counter += 4;
        final double ratio = (forward + backward) / (selfX + selfY);
        return 10.0d * (ratio - 1.0d);
    }

    /**
     * Raw measure supplied by the concrete distance.
     *
     * @param x first string
     * @param y second string
     * @return the raw measure for the ordered pair
     */
    protected abstract double d2(String x, String y);
}

View File

@ -1,58 +0,0 @@
package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
/**
* This class represent a couple (Object, distance) value of that Object from
* the Vp in each step of the algorithm.
*
* @author Paolo Ciccarese
*/
public class Node implements Serializable {
private static final long serialVersionUID = -2077473220894258550L;
private final Serializable obj;
private double distance;
public Node(Serializable obj, int i) {
this.obj = obj;
this.distance = i;
}
public Node(Serializable obj) {
this.obj = obj;
}
public Serializable get() {
return this.obj;
}
public void setDistance(double distance) {
this.distance = distance;
}
public double getDistance() {
return distance;
}
public String toString() {
return obj.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o instanceof Node) {
return ((Node) o).get().equals(this.obj);
}
return false;
}
@Override
public int hashCode() {
return this.obj.hashCode();
}
}

View File

@ -1,94 +0,0 @@
package edu.mit.simile.vicino.vptree;
/**
 * Sorts arrays of {@link Node} in place by ascending
 * {@link Node#getDistance()}.
 */
public class NodeSorter {
/**
 * Sorts an array of nodes in place over its whole length.
 *
 * @param nodes
 * The array to sort.
 */
public void sort(Node nodes[]) {
NodeSorter.sort(nodes, 0, nodes.length - 1);
}
/**
 * Sorts the inclusive range {@code [lo, hi]} of an array of nodes by
 * ascending distance, using a recursive QuickSort with a median-of-three
 * partition element.
 *
 * @param nodes
 * The array to sort in place.
 * @param lo
 * The current lower bound (inclusive).
 * @param hi
 * The current upper bound (inclusive).
 */
public static void sort(Node nodes[], int lo, int hi) {
// Zero or one element: nothing to do.
if (lo >= hi) {
return;
}
/*
 * Use median-of-three(lo, mid, hi) to pick a partition. Also swap them
 * into relative order while we are at it.
 */
int mid = (lo + hi) / 2;
if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
// Swap.
Node tmp = nodes[lo];
nodes[lo] = nodes[mid];
nodes[mid] = tmp;
}
if (nodes[mid].getDistance() > nodes[hi].getDistance()) {
// Swap .
Node tmp = nodes[mid];
nodes[mid] = nodes[hi];
nodes[hi] = tmp;
// The element moved into mid may now be smaller than lo; re-check.
if (nodes[lo].getDistance() > nodes[mid].getDistance()) {
// Swap.
Node tmp2 = nodes[lo];
nodes[lo] = nodes[mid];
nodes[mid] = tmp2;
}
}
// Start one past lo since already handled lo.
int left = lo + 1;
// Similarly, end one before hi since already handled hi.
int right = hi - 1;
// If there are three or fewer elements, we are done.
if (left >= right) {
return;
}
// The partition is held by reference, so later swaps do not change the
// distance being compared against.
Node partition = nodes[mid];
while (true) {
// No explicit lower bound: nodes[lo] is never moved and is <= the
// partition, so this scan stops at lo at the latest.
while (nodes[right].getDistance() > partition.getDistance()) {
--right;
}
while (left < right && nodes[left].getDistance() <= partition.getDistance()) {
++left;
}
if (left < right) {
// Swap.
Node tmp = nodes[left];
nodes[left] = nodes[right];
nodes[right] = tmp;
--right;
} else {
break;
}
}
// Everything up to left is <= the partition, everything after is >= it.
sort(nodes, lo, left);
sort(nodes, left + 1, hi);
}
}

View File

@ -1,56 +0,0 @@
package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
/**
 * A tree node holding a fixed payload, a median value, and optional left and
 * right children.
 *
 * @author Paolo Ciccarese
 */
public class TNode implements Serializable {

    private static final long serialVersionUID = -217604190976851241L;

    private final Serializable obj;
    private double median;
    private TNode left;
    private TNode right;

    /**
     * Creates a node around its payload. The payload is fixed here; the
     * median and the children are assigned afterwards through the setters.
     *
     * @param obj the payload
     */
    public TNode(Serializable obj) {
        this.obj = obj;
    }

    /**
     * @return the payload
     */
    public Serializable get() {
        return obj;
    }

    /**
     * @return the median stored in this node
     */
    public double getMedian() {
        return this.median;
    }

    /**
     * @param median the median to store in this node
     */
    public void setMedian(double median) {
        this.median = median;
    }

    /**
     * @return the left child, or {@code null} if none was set
     */
    public TNode getLeft() {
        return this.left;
    }

    /**
     * @param leftNode the new left child
     */
    public void setLeft(TNode leftNode) {
        left = leftNode;
    }

    /**
     * @return the right child, or {@code null} if none was set
     */
    public TNode getRight() {
        return this.right;
    }

    /**
     * @param rightNode the new right child
     */
    public void setRight(TNode rightNode) {
        right = rightNode;
    }

    /**
     * @return the payload's string form
     */
    public String toString() {
        return obj.toString();
    }
}

View File

@ -1,33 +0,0 @@
package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
/**
 * The VPTree class: a serializable holder for the root {@link TNode} of a
 * tree.
 *
 * @author Paolo Ciccarese
 */
public class VPTree implements Serializable {

    private static final long serialVersionUID = 1291056732155841123L;

    /** Root of the tree; stays {@code null} until one is assigned. */
    private TNode root;

    /**
     * Returns the node currently registered as the root.
     *
     * @return The VPTree root.
     */
    public TNode getRoot() {
        return this.root;
    }

    /**
     * Registers a node as the root of this tree.
     *
     * @param root The VPTree root.
     */
    public void setRoot(TNode root) {
        this.root = root;
    }
}

View File

@ -1,192 +0,0 @@
package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Collects serializable values and arranges them into a {@link VPTree}
 * according to a {@link Distance}.
 *
 * @author Paolo Ciccarese
 * @author Stefano Mazzocchi
 */
public class VPTreeBuilder {

    private static final boolean DEBUG = false;
    private static final boolean OPTIMIZED = false;
    private static final int sample_size = 10;

    private Random generator = new Random(System.currentTimeMillis());

    private final Distance distance;

    private Set<Node> nodes = new HashSet<Node>();

    /**
     * Defines a VPTree Builder for a specific distance.
     *
     * @param distance
     *            The class implementing the distance.
     */
    public VPTreeBuilder(Distance distance) {
        this.distance = distance;
    }

    /**
     * @return the live set of nodes collected so far
     */
    public Set<Node> getNodes() {
        return this.nodes;
    }

    /**
     * Adds one value to the set the next tree will be built from.
     *
     * @param s the value to add
     */
    public void populate(Serializable s) {
        nodes.add(new Node(s));
    }

    /**
     * Builds a tree from the values collected so far.
     *
     * @return the tree; its root is {@code null} when no values were collected
     */
    public VPTree buildVPTree() {
        if (DEBUG) {
            for (Node n : this.nodes) {
                System.out.println(n.get().toString());
            }
            System.out.println();
        }

        Node[] nodes_array = this.nodes.toArray(new Node[this.nodes.size()]);
        VPTree tree = new VPTree();
        if (nodes_array.length > 0) {
            tree.setRoot(makeNode(nodes_array, 0, nodes_array.length - 1));
        }
        return tree;
    }

    /**
     * Discards any collected values, collects the given ones and builds a tree.
     *
     * @param values the values to index
     * @return the tree built from {@code values}
     */
    public VPTree buildVPTree(Collection<? extends Serializable> values) {
        reset();
        for (Serializable s : values) {
            populate(s);
        }
        return buildVPTree();
    }

    /** Discards all collected values. */
    public void reset() {
        this.nodes.clear();
    }

    /**
     * Recursively builds the subtree for the inclusive range
     * {@code [begin, end]} of {@code nodes}.
     *
     * @return the subtree root, or {@code null} for an empty range
     */
    private TNode makeNode(Node nodes[], int begin, int end) {
        int delta = end - begin;

        if (DEBUG) System.out.println("\ndelta: " + delta);

        if (delta == 0) {
            TNode vpNode = new TNode(nodes[begin].get());
            vpNode.setMedian(0);
            return vpNode;
        } else if (delta < 0) {
            return null;
        }

        Node randomNode = getVantagePoint(nodes, begin, end);
        TNode vpNode = new TNode(randomNode.get());

        if (DEBUG) System.out.println("\nvp-node: " + vpNode.get().toString());

        calculateDistances(vpNode, nodes, begin, end);
        orderDistances(nodes, begin, end);
        fixVantagPoint(randomNode, nodes, begin, end);

        if (DEBUG) {
            for (int i = begin; i <= end; i++) {
                System.out.println(" +-- " + nodes[i].getDistance() + " --> " + nodes[i].get());
            }
        }

        // Keep the median in double precision. Narrowing it to float could
        // round it above every candidate distance, so no split point would be
        // found and the elements in between would be lost.
        double median = median(nodes, begin, end);
        vpNode.setMedian(median);

        // First index after the pivot whose distance is not below the median.
        int split = begin + 1;
        while (split < end && nodes[split].getDistance() < median) {
            split++;
        }

        // Always attach both halves: [begin + 1, split - 1] holds distances
        // below the median (an empty range yields null), and [split, end]
        // holds the rest.
        vpNode.setLeft(makeNode(nodes, begin + 1, split - 1));
        vpNode.setRight(makeNode(nodes, split, end));

        return vpNode;
    }

    /**
     * Chooses the vantage point for the inclusive range {@code [begin, end]}.
     * With {@code OPTIMIZED} off (the compiled-in setting) this is a uniformly
     * random element of the range.
     */
    private Node getVantagePoint(Node nodes[], int begin, int end) {
        if (OPTIMIZED) {
            // NOTE(review): experimental sampling branch, disabled by the
            // OPTIMIZED constant and left as found.
            Node buffer[] = new Node[sample_size];
            for (int i = 0; i < sample_size; i++) {
                buffer[i] = getRandomNode(nodes,begin,end);
            }

            double bestSpread = 0;
            Node bestNode = buffer[0];
            for (int i = 0; i < sample_size; i++) {
                calculateDistances(new TNode(buffer[i]), buffer, 0, buffer.length - 1);
                orderDistances(nodes, begin, end);
                double median = (double) median(nodes, begin, end);
                double spread = deviation(buffer, median);
                System.out.println(" " + spread);
                if (spread > bestSpread) {
                    bestSpread = spread;
                    bestNode = buffer[i];
                }
            }
            System.out.println("best: " + bestSpread);
            return bestNode;
        } else {
            return getRandomNode(nodes,begin,end);
        }
    }

    /**
     * Picks a uniformly random element of the inclusive range
     * {@code [begin, end]}.
     */
    private Node getRandomNode(Node nodes[], int begin, int end) {
        // nextInt's bound is exclusive, so +1 makes the last element eligible.
        return nodes[begin + generator.nextInt(end - begin + 1)];
    }

    /**
     * @return the mean squared difference between the buffered distances and
     *         {@code median}
     */
    private double deviation(Node buffer[], double median) {
        double sum = 0;
        for (int i = 0; i < buffer.length; i++) {
            sum += Math.pow(buffer[i].getDistance() - median, 2);
        }
        return sum / buffer.length;
    }

    /**
     * Returns the median distance of the inclusive range {@code [begin, end]},
     * read positionally from the array (the caller is expected to have ordered
     * it). For an even number of elements the two middle values are averaged.
     *
     * @param nodes the node array
     * @param begin first index of the range
     * @param end   last index of the range
     * @return the positional median of the stored distances
     */
    public double median(Node nodes[], int begin, int end) {
        int delta = end - begin;
        int middle = delta / 2;

        if (delta % 2 == 0) {
            return nodes[begin + middle].getDistance();
        } else {
            return (nodes[begin + middle].getDistance() + nodes[begin + middle + 1].getDistance()) / 2.0d;
        }
    }

    /**
     * Stores in every node of {@code [begin, end]} its distance from the
     * pivot's payload; identical or equal payloads get {@code 0} without
     * consulting the distance function.
     */
    private void calculateDistances(TNode pivot, Node nodes[], int begin, int end) {
        Serializable x = pivot.get();
        for (int i = begin; i <= end; i++) {
            Serializable y = nodes[i].get();
            double d = (x == y || x.equals(y)) ? 0.0d : distance.d(x.toString(), y.toString());
            nodes[i].setDistance(d);
        }
    }

    /**
     * Moves the pivot node to the front of the inclusive range
     * {@code [begin, end]} by swapping it with whatever is stored there.
     */
    private void fixVantagPoint(Node pivot, Node nodes[], int begin, int end) {
        // Scan the whole range, last slot included: the pivot is not
        // guaranteed to sort to the front when other distances are <= 0.
        for (int i = begin; i <= end; i++) {
            if (nodes[i] == pivot) {
                if (i > begin) {
                    Node tmp = nodes[begin];
                    nodes[begin] = pivot;
                    nodes[i] = tmp;
                }
                break;
            }
        }
    }

    /** Orders {@code [begin, end]} by ascending stored distance. */
    private void orderDistances(Node nodes[], int begin, int end) {
        NodeSorter.sort(nodes, begin, end);
    }
}

View File

@ -1,59 +0,0 @@
package edu.mit.simile.vicino.vptree;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import edu.mit.simile.vicino.distances.Distance;
/**
 * Runs range queries against a {@link VPTree} using a {@link Distance}.
 *
 * @author Paolo Ciccarese
 */
public class VPTreeSeeker {

    private static final boolean DEBUG = false;

    VPTree tree;
    Distance distance;

    /**
     * @param distance the distance used to compare the query with tree payloads
     * @param tree     the tree to search
     */
    public VPTreeSeeker(Distance distance, VPTree tree) {
        this.distance = distance;
        this.tree = tree;
    }

    /**
     * Collects every payload whose distance from {@code query} is at most
     * {@code range}.
     *
     * @param query the value to search around
     * @param range the maximum accepted distance
     * @return the matching payloads (possibly empty)
     */
    public Set<Serializable> range(Serializable query, double range) {
        if (DEBUG) System.out.println("--------------- " + query + " " + range);
        final Set<Serializable> found = new HashSet<Serializable>();
        return rangeTraversal(query, range, tree.getRoot(), found);
    }

    /**
     * Visits {@code tNode} and whichever of its subtrees may still contain
     * matches, adding matches to {@code results}.
     *
     * @return {@code results}, for convenience
     */
    private Set<Serializable> rangeTraversal(Serializable query, double range, TNode tNode, Set<Serializable> results) {

        if (DEBUG) System.out.println("> " + tNode);

        if (tNode != null) {
            final double measured = this.distance.d(query.toString(), tNode.get().toString());
            final double median = tNode.getMedian();

            if (measured <= range) {
                if (DEBUG) System.out.println("*** add ***");
                results.add(tNode.get());
            }

            // The query ball lies entirely below the median, entirely above
            // it, or straddles it.
            final boolean leftOnly = (measured + range) < median;
            final boolean rightOnly = !leftOnly && (measured - range) > median;

            if (leftOnly) {
                if (DEBUG) System.out.println("left: " + measured + " + " + range + " < " + tNode.getMedian());
                rangeTraversal(query, range, tNode.getLeft(), results);
            } else if (rightOnly) {
                if (DEBUG) System.out.println("right: " + measured + " + " + range + " > " + tNode.getMedian());
                rangeTraversal(query, range, tNode.getRight(), results);
            } else {
                if (DEBUG) System.out.println("left & right: " + measured + " + " + range + " = " + tNode.getMedian());
                rangeTraversal(query, range, tNode.getLeft(), results);
                rangeTraversal(query, range, tNode.getRight(), results);
            }
        }

        if (DEBUG) System.out.println("< " + tNode);

        return results;
    }
}

View File

@ -1,136 +0,0 @@
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "Ant" and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* This package is based on the work done by Keiron Liddle, Aftex Software
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
*/
package org.apache.tools.bzip2;
/**
 * Base class for both the compress and decompress classes.
 * Holds common arrays, and static data.
 *
 * As interface members, all of these are implicitly public, static and final.
 *
 * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
 */
public interface BZip2Constants {
// Block-size unit: the decompressor multiplies it by the stream's 1..9 level.
int baseBlockSize = 100000;
// Second dimension of the per-group coding tables.
int MAX_ALPHA_SIZE = 258;
// Number of code-length slots cleared when decode tables are built.
int MAX_CODE_LEN = 23;
// The two symbols used to encode run lengths.
int RUNA = 0;
int RUNB = 1;
// Maximum number of coding tables (groups).
int N_GROUPS = 6;
// Number of symbols decoded before the next selector is consulted.
int G_SIZE = 50;
// NOTE(review): not referenced in the visible code; presumably used by the compressor.
int N_ITERS = 4;
// Capacity of the selector arrays.
int MAX_SELECTORS = (2 + (900000 / G_SIZE));
// NOTE(review): not referenced in the visible code; presumably used by the compressor.
int NUM_OVERSHOOT_BYTES = 20;
// Table of 512 counts consumed cyclically when decoding randomised blocks.
int[] rNums = {
619, 720, 127, 481, 931, 816, 813, 233, 566, 247,
985, 724, 205, 454, 863, 491, 741, 242, 949, 214,
733, 859, 335, 708, 621, 574, 73, 654, 730, 472,
419, 436, 278, 496, 867, 210, 399, 680, 480, 51,
878, 465, 811, 169, 869, 675, 611, 697, 867, 561,
862, 687, 507, 283, 482, 129, 807, 591, 733, 623,
150, 238, 59, 379, 684, 877, 625, 169, 643, 105,
170, 607, 520, 932, 727, 476, 693, 425, 174, 647,
73, 122, 335, 530, 442, 853, 695, 249, 445, 515,
909, 545, 703, 919, 874, 474, 882, 500, 594, 612,
641, 801, 220, 162, 819, 984, 589, 513, 495, 799,
161, 604, 958, 533, 221, 400, 386, 867, 600, 782,
382, 596, 414, 171, 516, 375, 682, 485, 911, 276,
98, 553, 163, 354, 666, 933, 424, 341, 533, 870,
227, 730, 475, 186, 263, 647, 537, 686, 600, 224,
469, 68, 770, 919, 190, 373, 294, 822, 808, 206,
184, 943, 795, 384, 383, 461, 404, 758, 839, 887,
715, 67, 618, 276, 204, 918, 873, 777, 604, 560,
951, 160, 578, 722, 79, 804, 96, 409, 713, 940,
652, 934, 970, 447, 318, 353, 859, 672, 112, 785,
645, 863, 803, 350, 139, 93, 354, 99, 820, 908,
609, 772, 154, 274, 580, 184, 79, 626, 630, 742,
653, 282, 762, 623, 680, 81, 927, 626, 789, 125,
411, 521, 938, 300, 821, 78, 343, 175, 128, 250,
170, 774, 972, 275, 999, 639, 495, 78, 352, 126,
857, 956, 358, 619, 580, 124, 737, 594, 701, 612,
669, 112, 134, 694, 363, 992, 809, 743, 168, 974,
944, 375, 748, 52, 600, 747, 642, 182, 862, 81,
344, 805, 988, 739, 511, 655, 814, 334, 249, 515,
897, 955, 664, 981, 649, 113, 974, 459, 893, 228,
433, 837, 553, 268, 926, 240, 102, 654, 459, 51,
686, 754, 806, 760, 493, 403, 415, 394, 687, 700,
946, 670, 656, 610, 738, 392, 760, 799, 887, 653,
978, 321, 576, 617, 626, 502, 894, 679, 243, 440,
680, 879, 194, 572, 640, 724, 926, 56, 204, 700,
707, 151, 457, 449, 797, 195, 791, 558, 945, 679,
297, 59, 87, 824, 713, 663, 412, 693, 342, 606,
134, 108, 571, 364, 631, 212, 174, 643, 304, 329,
343, 97, 430, 751, 497, 314, 983, 374, 822, 928,
140, 206, 73, 263, 980, 736, 876, 478, 430, 305,
170, 514, 364, 692, 829, 82, 855, 953, 676, 246,
369, 970, 294, 750, 807, 827, 150, 790, 288, 923,
804, 378, 215, 828, 592, 281, 565, 555, 710, 82,
896, 831, 547, 261, 524, 462, 293, 465, 502, 56,
661, 821, 976, 991, 658, 869, 905, 758, 745, 193,
768, 550, 608, 933, 378, 286, 215, 979, 792, 961,
61, 688, 793, 644, 986, 403, 106, 366, 905, 644,
372, 567, 466, 434, 645, 210, 389, 550, 919, 135,
780, 773, 635, 389, 707, 100, 626, 958, 165, 504,
920, 176, 193, 713, 857, 265, 203, 50, 668, 108,
645, 990, 626, 197, 510, 357, 358, 850, 858, 364,
936, 638
};
}

View File

@ -1,865 +0,0 @@
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "Ant" and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* This package is based on the work done by Keiron Liddle, Aftex Software
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
*/
package org.apache.tools.bzip2;
import java.io.IOException;
import java.io.InputStream;
/**
* An input stream that decompresses from the BZip2 format (without the file
* header chars) to be read as any other stream.
*
* @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
*/
public class CBZip2InputStream extends InputStream implements BZip2Constants {
/**
 * Reports a decoding problem on standard output. Every problem reported
 * through this method produces the same message, and nothing is thrown.
 */
private static void cadvise() {
    final String message = "CRC Error";
    System.out.println(message);
}
/**
 * Called when the compressed input cannot supply another byte; the problem
 * is only reported, via {@code cadvise()}.
 */
private static void compressedStreamEOF() {
    cadvise();
}
/**
 * Rebuilds the mappings between byte values and their dense sequence
 * numbers from the {@code inUse} flags, and records how many values are in
 * use.
 */
private void makeMaps() {
    int used = 0;
    for (int value = 0; value < 256; value++) {
        if (!inUse[value]) {
            continue;
        }
        seqToUnseq[used] = (char) value;
        unseqToSeq[value] = (char) used;
        used++;
    }
    nInUse = used;
}
/*
index of the last char in the block, so
the block size == last + 1.
*/
private int last;
/*
index in zptr[] of original string after sorting.
*/
private int origPtr;
/*
always: in the range 0 .. 9.
The current block size is 100000 * this number.
*/
private int blockSize100k;
// Whether the current block's header had its randomisation bit set.
private boolean blockRandomised;
// Bit buffer for the input stream and the number of unread bits it holds.
private int bsBuff;
private int bsLive;
// CRC accumulator for the block being output.
private CRC mCrc = new CRC();
// Which byte values occur in the current block, and how many of them.
private boolean[] inUse = new boolean[256];
private int nInUse;
// Mappings between byte values and dense sequence numbers (see makeMaps()).
private char[] seqToUnseq = new char[256];
private char[] unseqToSeq = new char[256];
// Coding-table selectors, decoded and still move-to-front encoded.
private char[] selector = new char[MAX_SELECTORS];
private char[] selectorMtf = new char[MAX_SELECTORS];
// Block work arrays; null after construction until allocated elsewhere.
private int[] tt;
private char[] ll8;
/*
freq table collected to save a pass over the data
during decompression.
*/
private int[] unzftab = new int[256];
// Per-group decoding tables and minimum code lengths.
private int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE];
private int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE];
private int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE];
private int[] minLens = new int[N_GROUPS];
// Underlying compressed input; set to null once it has been closed.
private InputStream bsStream;
// Set once the stream has ended or was rejected; read() then returns -1.
private boolean streamEnd = false;
// The value the next read() call will return.
private int currentChar = -1;
// States of the output state machine advanced by read().
private static final int START_BLOCK_STATE = 1;
private static final int RAND_PART_A_STATE = 2;
private static final int RAND_PART_B_STATE = 3;
private static final int RAND_PART_C_STATE = 4;
private static final int NO_RAND_PART_A_STATE = 5;
private static final int NO_RAND_PART_B_STATE = 6;
private static final int NO_RAND_PART_C_STATE = 7;
private int currentState = START_BLOCK_STATE;
// CRCs read from the stream and the ones computed while decoding.
private int storedBlockCRC, storedCombinedCRC;
private int computedBlockCRC, computedCombinedCRC;
// Working state shared by the setup*Part* methods.
int i2, count, chPrev, ch2;
int i, tPos;
// Countdown and table position used when decoding randomised blocks.
int rNToGo = 0;
int rTPos = 0;
int j2;
char z;
/**
 * Creates a decompressing stream over {@code zStream} and prepares the first
 * block. If the stream header is rejected, the stream is marked as ended and
 * every {@code read()} returns -1.
 *
 * @param zStream the compressed input
 */
public CBZip2InputStream(InputStream zStream) {
    ll8 = null;
    tt = null;
    bsSetStream(zStream);
    initialize();
    if (streamEnd) {
        // initialize() rejected the header and released the input; decoding
        // a block now would dereference the released stream.
        return;
    }
    initBlock();
    setupBlock();
}
/**
 * Returns the next decompressed value and advances the decoder state
 * machine so the following value is ready.
 *
 * @return the next value, or -1 once the stream has ended
 */
public int read() {
    if (streamEnd) {
        return -1;
    }
    final int result = currentChar;
    switch (currentState) {
    case RAND_PART_B_STATE:
        setupRandPartB();
        break;
    case RAND_PART_C_STATE:
        setupRandPartC();
        break;
    case NO_RAND_PART_B_STATE:
        setupNoRandPartB();
        break;
    case NO_RAND_PART_C_STATE:
        setupNoRandPartC();
        break;
    default:
        // START_BLOCK and the two PART_A states need no advance here.
        break;
    }
    return result;
}
/**
 * Reads the two header characters: an 'h' followed by a block-size digit
 * '1'..'9'. A bad header releases the input and ends the stream; a good one
 * sizes the decompression structures and clears the combined CRC.
 */
private void initialize() {
    final char formatMarker = bsGetUChar();
    final char levelDigit = bsGetUChar();
    final boolean headerOk = formatMarker == 'h' && levelDigit >= '1' && levelDigit <= '9';
    if (!headerOk) {
        bsFinishedWithStream();
        streamEnd = true;
        return;
    }
    setDecompressStructureSizes(levelDigit - '0');
    computedCombinedCRC = 0;
}
/**
 * Reads the six-byte marker that starts every block. The end-of-stream
 * marker finishes the stream; anything other than the block marker is
 * reported and ends the stream. For a valid block, reads its stored CRC and
 * randomisation bit, decodes the block and resets the CRC accumulator.
 */
private void initBlock() {
    final char[] magic = new char[6];
    for (int k = 0; k < magic.length; k++) {
        magic[k] = bsGetUChar();
    }

    final boolean endOfStream = magic[0] == 0x17 && magic[1] == 0x72 && magic[2] == 0x45
            && magic[3] == 0x38 && magic[4] == 0x50 && magic[5] == 0x90;
    if (endOfStream) {
        complete();
        return;
    }

    final boolean blockStart = magic[0] == 0x31 && magic[1] == 0x41 && magic[2] == 0x59
            && magic[3] == 0x26 && magic[4] == 0x53 && magic[5] == 0x59;
    if (!blockStart) {
        badBlockHeader();
        streamEnd = true;
        return;
    }

    storedBlockCRC = bsGetInt32();
    blockRandomised = (bsR(1) == 1);

    getAndMoveToFrontDecode();

    mCrc.initialiseCRC();
    currentState = START_BLOCK_STATE;
}
/**
 * Finishes the current block: compares its computed CRC with the stored one
 * (a mismatch is reported through {@code crcError()}) and folds the block
 * CRC into the combined CRC.
 */
private void endBlock() {
    computedBlockCRC = mCrc.getFinalCRC();
    if (computedBlockCRC != storedBlockCRC) {
        crcError();
    }
    // Rotate the running value left by one bit, then mix in this block.
    final int rotated = (computedCombinedCRC << 1) | (computedCombinedCRC >>> 31);
    computedCombinedCRC = rotated ^ computedBlockCRC;
}
/**
 * Handles the end-of-stream marker: reads the stored combined CRC, reports a
 * mismatch with the computed one, releases the input and ends the stream.
 */
private void complete() {
    storedCombinedCRC = bsGetInt32();
    final boolean crcMatches = storedCombinedCRC == computedCombinedCRC;
    if (!crcMatches) {
        crcError();
    }
    bsFinishedWithStream();
    streamEnd = true;
}
/**
 * Called when decoded data exceeds the block limit; the problem is only
 * reported, via {@code cadvise()}.
 */
private static void blockOverrun() {
    cadvise();
}
/**
 * Called when a block does not start with the expected marker; the problem
 * is only reported, via {@code cadvise()}.
 */
private static void badBlockHeader() {
    cadvise();
}
/**
 * Called when a stored CRC differs from the computed one; the problem is
 * only reported, via {@code cadvise()}.
 */
private static void crcError() {
    cadvise();
}
/**
 * Closes and forgets the underlying input, unless there is none or it is
 * {@code System.in}. A failure to close is ignored, in which case the
 * reference is kept.
 */
private void bsFinishedWithStream() {
    final boolean closable = this.bsStream != null && this.bsStream != System.in;
    if (!closable) {
        return;
    }
    try {
        this.bsStream.close();
        this.bsStream = null;
    } catch (IOException ioe) {
        //ignore
    }
}
/**
 * Installs the compressed input and empties the bit buffer.
 *
 * @param f the stream to read compressed bytes from
 */
private void bsSetStream(InputStream f) {
    this.bsStream = f;
    this.bsBuff = 0;
    this.bsLive = 0;
}
/**
 * Reads the next {@code n} bits from the compressed input, most significant
 * bit first, refilling the bit buffer one byte at a time as needed.
 *
 * @param n number of bits to read
 * @return the bits, right-aligned in an int
 */
private int bsR(int n) {
    while (bsLive < n) {
        // Keep the read result as an int: once narrowed to char, the -1
        // end-of-stream marker can never compare equal to -1.
        int nextByte = 0;
        try {
            nextByte = bsStream.read();
        } catch (IOException e) {
            compressedStreamEOF();
        }
        if (nextByte == -1) {
            compressedStreamEOF();
        }
        bsBuff = (bsBuff << 8) | (nextByte & 0xff);
        bsLive += 8;
    }

    int v = (bsBuff >> (bsLive - n)) & ((1 << n) - 1);
    bsLive -= n;
    return v;
}
/**
 * @return the next eight bits of the input as an unsigned character
 */
private char bsGetUChar() {
    final int octet = bsR(8);
    return (char) octet;
}
/**
 * @return the next 32 bits of the input, assembled from four bytes with the
 *         first byte read as the most significant
 */
private int bsGetint() {
    int value = 0;
    for (int octet = 0; octet < 4; octet++) {
        value = (value << 8) | bsR(8);
    }
    return value;
}
/**
 * @param numBits number of bits to read
 * @return the next {@code numBits} bits of the input as an int
 */
private int bsGetIntVS(int numBits) {
    final int bits = bsR(numBits);
    return bits;
}
/**
 * @return the next 32 bits of the input as an int
 */
private int bsGetInt32() {
    final int word = bsGetint();
    return word;
}
/**
 * Fills the decode tables for one coding group from its code lengths.
 *
 * @param limit     output: per code length, filled from the running code
 *                  value (see the loop below)
 * @param base      output: first used as per-length symbol counts, then
 *                  rewritten into per-length offsets
 * @param perm      output: symbol indices ordered by increasing code length
 * @param length    code length of each symbol
 * @param minLen    smallest code length in use
 * @param maxLen    largest code length in use
 * @param alphaSize number of symbols in the alphabet
 */
private void hbCreateDecodeTables(int[] limit, int[] base,
int[] perm, char[] length,
int minLen, int maxLen, int alphaSize) {
int pp, i, j, vec;
pp = 0;
// List the symbols by increasing code length, in symbol order within a length.
for (i = minLen; i <= maxLen; i++) {
for (j = 0; j < alphaSize; j++) {
if (length[j] == i) {
perm[pp] = j;
pp++;
}
}
}
for (i = 0; i < MAX_CODE_LEN; i++) {
base[i] = 0;
}
// Count the symbols of each length, stored one slot up...
for (i = 0; i < alphaSize; i++) {
base[length[i] + 1]++;
}
// ...then accumulate, so base[i] is the number of symbols shorter than i.
for (i = 1; i < MAX_CODE_LEN; i++) {
base[i] += base[i - 1];
}
for (i = 0; i < MAX_CODE_LEN; i++) {
limit[i] = 0;
}
vec = 0;
// Walk the lengths: add the number of codes of length i, record the value
// minus one in limit[i], then double for the next length.
for (i = minLen; i <= maxLen; i++) {
vec += (base[i + 1] - base[i]);
limit[i] = vec - 1;
vec <<= 1;
}
// Rewrite base[i] as the offset to subtract from a code of length i to get
// its index into perm.
for (i = minLen + 1; i <= maxLen; i++) {
base[i] = ((limit[i - 1] + 1) << 1) - base[i];
}
}
/**
 * Reads a block's decoding metadata from the bit stream: the map of byte
 * values in use, the table selectors, and the code lengths of every coding
 * table; then builds the decode tables for each group.
 */
private void recvDecodingTables() {
char len[][] = new char[N_GROUPS][MAX_ALPHA_SIZE];
int i, j, t, nGroups, nSelectors, alphaSize;
int minLen, maxLen;
boolean[] inUse16 = new boolean[16];
/* Receive the mapping table */
// First 16 bits: which ranges of 16 byte values have any value in use.
for (i = 0; i < 16; i++) {
if (bsR(1) == 1) {
inUse16[i] = true;
} else {
inUse16[i] = false;
}
}
for (i = 0; i < 256; i++) {
inUse[i] = false;
}
// For each flagged range, 16 more bits say which of its values are in use.
for (i = 0; i < 16; i++) {
if (inUse16[i]) {
for (j = 0; j < 16; j++) {
if (bsR(1) == 1) {
inUse[i * 16 + j] = true;
}
}
}
}
makeMaps();
// Alphabet size is the number of byte values in use plus two.
alphaSize = nInUse + 2;
/* Now the selectors */
nGroups = bsR(3);
nSelectors = bsR(15);
// Each selector is stored in unary: a run of 1 bits ended by a 0 bit.
for (i = 0; i < nSelectors; i++) {
j = 0;
while (bsR(1) == 1) {
j++;
}
selectorMtf[i] = (char) j;
}
/* Undo the MTF values for the selectors. */
{
char[] pos = new char[N_GROUPS];
char tmp, v;
for (v = 0; v < nGroups; v++) {
pos[v] = v;
}
for (i = 0; i < nSelectors; i++) {
v = selectorMtf[i];
tmp = pos[v];
// Shift the entries in front of position v back by one...
while (v > 0) {
pos[v] = pos[v - 1];
v--;
}
// ...and move the selected entry to the front.
pos[0] = tmp;
selector[i] = tmp;
}
}
/* Now the coding tables */
for (t = 0; t < nGroups; t++) {
// Lengths are delta-coded from a 5-bit start value: each "1" bit is
// followed by a bit choosing increment (0) or decrement (1).
int curr = bsR(5);
for (i = 0; i < alphaSize; i++) {
while (bsR(1) == 1) {
if (bsR(1) == 0) {
curr++;
} else {
curr--;
}
}
len[t][i] = (char) curr;
}
}
/* Create the Huffman decoding tables */
for (t = 0; t < nGroups; t++) {
minLen = 32;
maxLen = 0;
for (i = 0; i < alphaSize; i++) {
if (len[t][i] > maxLen) {
maxLen = len[t][i];
}
if (len[t][i] < minLen) {
minLen = len[t][i];
}
}
hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen,
maxLen, alphaSize);
minLens[t] = minLen;
}
}
/**
 * Decodes one block's symbol stream into {@code ll8}: reads the original
 * pointer and the decoding tables, then repeatedly decodes a symbol, expands
 * RUNA/RUNB run lengths, and undoes the move-to-front transform, until the
 * end-of-block symbol is seen. Also tallies byte frequencies in
 * {@code unzftab} and leaves the index of the last decoded byte in
 * {@code last}.
 *
 * The symbol-decoding sequence appears inline three times: once before the
 * main loop and once in each branch of it.
 */
private void getAndMoveToFrontDecode() {
char[] yy = new char[256];
int i, j, nextSym, limitLast;
int EOB, groupNo, groupPos;
limitLast = baseBlockSize * blockSize100k;
origPtr = bsGetIntVS(24);
recvDecodingTables();
// The end-of-block symbol is the one after the last in-use value.
EOB = nInUse + 1;
groupNo = -1;
groupPos = 0;
/*
Setting up the unzftab entries here is not strictly
necessary, but it does save having to do it later
in a separate pass, and so saves a block's worth of
cache misses.
*/
for (i = 0; i <= 255; i++) {
unzftab[i] = 0;
}
// Move-to-front list starts as the identity.
for (i = 0; i <= 255; i++) {
yy[i] = (char) i;
}
last = -1;
// Decode the first symbol.
{
int zt, zn, zvec, zj;
// Switch to the next coding table every G_SIZE symbols.
if (groupPos == 0) {
groupNo++;
groupPos = G_SIZE;
}
groupPos--;
zt = selector[groupNo];
zn = minLens[zt];
zvec = bsR(zn);
// Extend the code one bit at a time until it fits the current length.
while (zvec > limit[zt][zn]) {
zn++;
{
{
while (bsLive < 1) {
int zzi;
char thech = 0;
try {
thech = (char) bsStream.read();
} catch (IOException e) {
compressedStreamEOF();
}
// NOTE(review): thech is a char, so this comparison with -1 can never be true.
if (thech == -1) {
compressedStreamEOF();
}
zzi = thech;
bsBuff = (bsBuff << 8) | (zzi & 0xff);
bsLive += 8;
}
}
zj = (bsBuff >> (bsLive - 1)) & 1;
bsLive--;
}
zvec = (zvec << 1) | zj;
}
nextSym = perm[zt][zvec - base[zt][zn]];
}
while (true) {
if (nextSym == EOB) {
break;
}
if (nextSym == RUNA || nextSym == RUNB) {
char ch;
int s = -1;
int N = 1;
// Accumulate the run length: RUNA adds 1 * N and RUNB adds 2 * N,
// with N doubling for each successive run symbol.
do {
if (nextSym == RUNA) {
s = s + (0 + 1) * N;
} else if (nextSym == RUNB) {
s = s + (1 + 1) * N;
}
N = N * 2;
// Decode the next symbol (same sequence as above).
{
int zt, zn, zvec, zj;
if (groupPos == 0) {
groupNo++;
groupPos = G_SIZE;
}
groupPos--;
zt = selector[groupNo];
zn = minLens[zt];
zvec = bsR(zn);
while (zvec > limit[zt][zn]) {
zn++;
{
{
while (bsLive < 1) {
int zzi;
char thech = 0;
try {
thech = (char) bsStream.read();
} catch (IOException e) {
compressedStreamEOF();
}
// NOTE(review): thech is a char, so this comparison with -1 can never be true.
if (thech == -1) {
compressedStreamEOF();
}
zzi = thech;
bsBuff = (bsBuff << 8) | (zzi & 0xff);
bsLive += 8;
}
}
zj = (bsBuff >> (bsLive - 1)) & 1;
bsLive--;
}
zvec = (zvec << 1) | zj;
}
nextSym = perm[zt][zvec - base[zt][zn]];
}
} while (nextSym == RUNA || nextSym == RUNB);
s++;
// The run repeats the byte currently at the front of the MTF list.
ch = seqToUnseq[yy[0]];
unzftab[ch] += s;
while (s > 0) {
last++;
ll8[last] = ch;
s--;
}
// NOTE(review): the overrun check happens after the run has been written.
if (last >= limitLast) {
blockOverrun();
}
continue;
} else {
char tmp;
last++;
if (last >= limitLast) {
blockOverrun();
}
// Symbol k stands for the entry at MTF position k - 1.
tmp = yy[nextSym - 1];
unzftab[seqToUnseq[tmp]]++;
ll8[last] = seqToUnseq[tmp];
/*
This loop is hammered during decompression,
hence the unrolling.
for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1];
*/
j = nextSym - 1;
for (; j > 3; j -= 4) {
yy[j] = yy[j - 1];
yy[j - 1] = yy[j - 2];
yy[j - 2] = yy[j - 3];
yy[j - 3] = yy[j - 4];
}
for (; j > 0; j--) {
yy[j] = yy[j - 1];
}
yy[0] = tmp;
// Decode the next symbol (same sequence as above, but without the
// end-of-stream comparison the other two copies have).
{
int zt, zn, zvec, zj;
if (groupPos == 0) {
groupNo++;
groupPos = G_SIZE;
}
groupPos--;
zt = selector[groupNo];
zn = minLens[zt];
zvec = bsR(zn);
while (zvec > limit[zt][zn]) {
zn++;
{
{
while (bsLive < 1) {
int zzi;
char thech = 0;
try {
thech = (char) bsStream.read();
} catch (IOException e) {
compressedStreamEOF();
}
zzi = thech;
bsBuff = (bsBuff << 8) | (zzi & 0xff);
bsLive += 8;
}
}
zj = (bsBuff >> (bsLive - 1)) & 1;
bsLive--;
}
zvec = (zvec << 1) | zj;
}
nextSym = perm[zt][zvec - base[zt][zn]];
}
continue;
}
}
}
/**
 * Builds the {@code tt} index table for the block currently held in
 * {@code ll8[0..last]} and starts output of that block.
 *
 * <p>A running total of the per-symbol counts in {@code unzftab} gives,
 * for every symbol value, the first slot of {@code tt} reserved for it;
 * each position of {@code ll8} is then written into the next free slot
 * of its symbol. Afterwards the read cursor {@code tPos} is taken from
 * {@code tt[origPtr]}, the run-tracking fields are reset, and the first
 * character is produced through {@link #setupRandPartA()} or
 * {@link #setupNoRandPartA()} depending on {@code blockRandomised}.</p>
 */
private void setupBlock() {
    // nextSlot[k] starts out as unzftab[0] + ... + unzftab[k - 1].
    final int[] nextSlot = new int[257];
    for (i = 1; i <= 256; i++) {
        nextSlot[i] = nextSlot[i - 1] + unzftab[i - 1];
    }

    for (i = 0; i <= last; i++) {
        final char symbol = (char) ll8[i];
        tt[nextSlot[symbol]++] = i;
    }

    tPos = tt[origPtr];
    count = 0;
    i2 = 0;
    ch2 = 256; /* not a char and not EOF */

    if (!blockRandomised) {
        setupNoRandPartA();
        return;
    }
    rNToGo = 0;
    rTPos = 0;
    setupRandPartA();
}
/**
 * Produces the next character of a randomised block.
 *
 * <p>While {@code i2 <= last}, the previous character is remembered in
 * {@code chPrev}, the next one is fetched from {@code ll8[tPos]} and the
 * cursor advances through {@code tt}. The countdown {@code rNToGo} is
 * reloaded from {@code rNums} (cycling {@code rTPos} through 0..511)
 * whenever it reaches zero; when the decremented countdown equals 1 the
 * lowest bit of the character is flipped. The character is published in
 * {@code currentChar}, fed to the CRC, and the state moves to
 * {@code RAND_PART_B_STATE}.</p>
 *
 * <p>Once the block is exhausted it is finished with {@code endBlock()},
 * and the next one is prepared with {@code initBlock()} and
 * {@code setupBlock()}.</p>
 */
private void setupRandPartA() {
    if (i2 > last) {
        endBlock();
        initBlock();
        setupBlock();
        return;
    }

    chPrev = ch2;
    ch2 = ll8[tPos];
    tPos = tt[tPos];

    if (rNToGo == 0) {
        rNToGo = rNums[rTPos];
        if (++rTPos == 512) {
            rTPos = 0;
        }
    }
    rNToGo--;
    if (rNToGo == 1) {
        ch2 ^= 1;
    }

    i2++;
    currentChar = ch2;
    currentState = RAND_PART_B_STATE;
    mCrc.updateCRC(ch2);
}
/**
 * Produces the next character of a non-randomised block.
 *
 * <p>While {@code i2 <= last}, the previous character is remembered in
 * {@code chPrev}, the next one is fetched from {@code ll8[tPos]}, the
 * cursor advances through {@code tt}, and the character is published in
 * {@code currentChar} and fed to the CRC. The state then moves to
 * {@code NO_RAND_PART_B_STATE}.</p>
 *
 * <p>Once the block is exhausted it is finished with {@code endBlock()},
 * and the next one is prepared with {@code initBlock()} and
 * {@code setupBlock()}.</p>
 */
private void setupNoRandPartA() {
    if (i2 > last) {
        endBlock();
        initBlock();
        setupBlock();
        return;
    }

    chPrev = ch2;
    ch2 = ll8[tPos];
    tPos = tt[tPos];
    i2++;

    currentChar = ch2;
    currentState = NO_RAND_PART_B_STATE;
    mCrc.updateCRC(ch2);
}
/**
 * Run-tracking step for a randomised block, entered after
 * {@link #setupRandPartA()} has produced a character.
 *
 * <p>If the new character differs from the previous one the run counter
 * restarts at 1 and decoding continues in part A. Otherwise the counter
 * grows; on reaching 4 the next entry of {@code ll8} is read into
 * {@code z} (with the same {@code rNums}-driven low-bit flip that part A
 * applies) and control passes to {@link #setupRandPartC()}, which uses
 * {@code z} as its repeat limit. Below 4, decoding simply continues in
 * part A.</p>
 */
private void setupRandPartB() {
    if (ch2 != chPrev) {
        // Run broken: start counting afresh.
        currentState = RAND_PART_A_STATE;
        count = 1;
        setupRandPartA();
        return;
    }

    count++;
    if (count < 4) {
        currentState = RAND_PART_A_STATE;
        setupRandPartA();
        return;
    }

    // Fourth equal character in a row: the next entry is a repeat count.
    z = ll8[tPos];
    tPos = tt[tPos];
    if (rNToGo == 0) {
        rNToGo = rNums[rTPos];
        if (++rTPos == 512) {
            rTPos = 0;
        }
    }
    rNToGo--;
    if (rNToGo == 1) {
        z ^= 1;
    }

    j2 = 0;
    currentState = RAND_PART_C_STATE;
    setupRandPartC();
}
/**
 * Repeat step for a randomised block: emits {@code ch2} again while
 * {@code j2} is below the limit {@code z}, updating the CRC for each
 * repetition. Once the limit is reached, {@code i2} is advanced, the run
 * counter is cleared and decoding resumes in
 * {@link #setupRandPartA()}.
 */
private void setupRandPartC() {
    if (j2 >= (int) z) {
        currentState = RAND_PART_A_STATE;
        i2++;
        count = 0;
        setupRandPartA();
        return;
    }

    currentChar = ch2;
    mCrc.updateCRC(ch2);
    j2++;
}
/**
 * Run-tracking step for a non-randomised block, entered after
 * {@link #setupNoRandPartA()} has produced a character.
 *
 * <p>If the new character differs from the previous one the run counter
 * restarts at 1 and decoding continues in part A. Otherwise the counter
 * grows; on reaching 4 the next entry of {@code ll8} is read into
 * {@code z} and control passes to {@link #setupNoRandPartC()}, which
 * uses {@code z} as its repeat limit. Below 4, decoding simply continues
 * in part A.</p>
 */
private void setupNoRandPartB() {
    if (ch2 != chPrev) {
        // Run broken: start counting afresh.
        currentState = NO_RAND_PART_A_STATE;
        count = 1;
        setupNoRandPartA();
        return;
    }

    count++;
    if (count < 4) {
        currentState = NO_RAND_PART_A_STATE;
        setupNoRandPartA();
        return;
    }

    // Fourth equal character in a row: the next entry is a repeat count.
    z = ll8[tPos];
    tPos = tt[tPos];
    j2 = 0;
    currentState = NO_RAND_PART_C_STATE;
    setupNoRandPartC();
}
/**
 * Repeat step for a non-randomised block: emits {@code ch2} again while
 * {@code j2} is below the limit {@code z}, updating the CRC for each
 * repetition. Once the limit is reached, {@code i2} is advanced, the run
 * counter is cleared and decoding resumes in
 * {@link #setupNoRandPartA()}.
 */
private void setupNoRandPartC() {
    if (j2 >= (int) z) {
        currentState = NO_RAND_PART_A_STATE;
        i2++;
        count = 0;
        setupNoRandPartA();
        return;
    }

    currentChar = ch2;
    mCrc.updateCRC(ch2);
    j2++;
}
/**
 * Records the block-size multiplier and allocates the per-block buffers.
 *
 * <p>{@code blockSize100k} is always updated. When the multiplier is
 * non-zero, {@code ll8} and {@code tt} are re-allocated with
 * {@code baseBlockSize * newSize100k} entries each; a multiplier of zero
 * leaves the existing buffers untouched.</p>
 *
 * <p>No range validation is enforced here: neither the new nor the
 * previous multiplier is checked against 0..9, so an out-of-range value
 * is passed straight to the array allocation.</p>
 *
 * @param newSize100k block-size multiplier applied to {@code baseBlockSize}
 */
private void setDecompressStructureSizes(int newSize100k) {
    blockSize100k = newSize100k;
    if (newSize100k != 0) {
        final int capacity = baseBlockSize * newSize100k;
        ll8 = new char[capacity];
        tt = new int[capacity];
    }
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,167 +0,0 @@
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "Ant" and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* This package is based on the work done by Keiron Liddle, Aftex Software
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
*/
package org.apache.tools.bzip2;
/**
 * A simple class to hold and calculate the CRC for sanity checking
 * of the data.
 *
 * <p>The checksum is a table-driven 32-bit CRC processed one byte at a
 * time, most significant byte first. The running value starts at
 * {@code 0xffffffff}; {@link #getFinalCRC()} returns its bitwise
 * complement.</p>
 *
 * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
 */
class CRC {
    /**
     * Lookup table with 256 entries, indexed by the top byte of the
     * running CRC XOR-ed with the incoming byte. The reference is final;
     * callers must treat the contents as read-only.
     */
    public static final int crc32Table[] = {
        0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9,
        0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
        0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
        0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
        0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9,
        0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
        0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011,
        0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
        0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
        0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
        0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81,
        0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
        0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49,
        0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
        0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
        0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
        0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae,
        0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
        0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
        0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
        0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
        0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
        0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066,
        0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
        0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e,
        0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
        0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
        0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
        0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
        0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
        0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686,
        0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
        0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
        0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
        0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f,
        0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
        0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
        0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
        0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
        0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
        0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7,
        0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
        0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f,
        0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
        0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
        0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
        0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f,
        0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
        0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
        0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
        0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
        0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
        0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30,
        0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
        0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088,
        0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
        0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
        0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
        0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
        0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
        0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0,
        0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
        0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
        0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
    };

    /** Running (not yet complemented) CRC value. */
    int globalCrc;

    /** Creates a CRC whose running value is already initialised. */
    public CRC() {
        initialiseCRC();
    }

    /** Resets the running value to its start value, {@code 0xffffffff}. */
    void initialiseCRC() {
        globalCrc = 0xffffffff;
    }

    /**
     * Returns the finished checksum.
     *
     * @return the bitwise complement of the running value
     */
    int getFinalCRC() {
        return ~globalCrc;
    }

    /**
     * Returns the running value as it is, without complementing it.
     *
     * @return the current running CRC value
     */
    int getGlobalCRC() {
        return globalCrc;
    }

    /**
     * Overwrites the running value.
     *
     * @param newCrc the running CRC value to continue from
     */
    void setGlobalCRC(int newCrc) {
        globalCrc = newCrc;
    }

    /**
     * Folds one byte into the running CRC.
     *
     * <p>Only the low eight bits of {@code inCh} are used. The unsigned
     * shift and the mask keep the table index within 0..255 for every
     * {@code int} argument, so values outside the byte range can no
     * longer index past the table.</p>
     *
     * @param inCh the byte to add; bits above the lowest eight are ignored
     */
    void updateCRC(int inCh) {
        final int index = ((globalCrc >>> 24) ^ inCh) & 0xff;
        globalCrc = (globalCrc << 8) ^ crc32Table[index];
    }
}