- save encoding and confidence in the project metadata

- use the saved encoding for decoding
- don't error when fingerprinting null


git-svn-id: http://google-refine.googlecode.com/svn/trunk@160 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
Stefano Mazzocchi 2010-03-01 04:56:16 +00:00
parent bc9bc54d30
commit 621655372f
4 changed files with 37 additions and 6 deletions

View File

@ -14,6 +14,8 @@ public class ProjectMetadata implements Serializable, Jsonizable {
private final Date _created = new Date();
private String _name;
private String _password;
private String _encoding;
private int _encodingConfidence;
private Date _modified = new Date();
public Date getCreated() {
@ -28,6 +30,26 @@ public class ProjectMetadata implements Serializable, Jsonizable {
return _name;
}
/**
 * Records the character encoding name associated with this project.
 *
 * @param encoding the encoding name to store; stored as given, without validation
 */
public void setEncoding(String encoding) {
    _encoding = encoding;
}
/**
 * Returns the character encoding name stored by {@code setEncoding}.
 *
 * @return the stored encoding name, exactly as it was last set
 */
public String getEncoding() {
    return this._encoding;
}
/**
 * Records the confidence value that accompanies the stored encoding.
 *
 * @param confidence the confidence value to store; stored as given, without range checks
 */
public void setEncodingConfidence(int confidence) {
    _encodingConfidence = confidence;
}
/**
 * Sets the encoding confidence from its decimal string representation.
 * <p>
 * A {@code null} argument is ignored and leaves the current confidence
 * unchanged. Callers commonly pass the result of a property lookup, which
 * is {@code null} when the key was never set; {@code Integer.parseInt(null)}
 * would otherwise throw and abort the caller.
 *
 * @param confidence the confidence as decimal integer text, or {@code null}
 *                   when no confidence is available
 * @throws NumberFormatException if {@code confidence} is non-null but is not
 *                               a parsable integer
 */
public void setEncodingConfidence(String confidence) {
    if (confidence == null) {
        // Nothing was detected or recorded: keep whatever value is already stored.
        return;
    }
    this.setEncodingConfidence(Integer.parseInt(confidence));
}
/**
 * Returns the confidence value stored by {@code setEncodingConfidence}.
 *
 * @return the stored confidence value
 */
public int getEncodingConfidence() {
    return this._encodingConfidence;
}
/**
 * Records the password associated with this project.
 *
 * @param password the password to store; stored as given
 */
public void setPassword(String password) {
    _password = password;
}

View File

@ -49,6 +49,8 @@ public class CreateProjectCommand extends Command {
ProjectMetadata pm = new ProjectMetadata();
pm.setName(options.getProperty("project-name"));
pm.setPassword(options.getProperty("project-password"));
pm.setEncoding(options.getProperty("encoding"));
pm.setEncodingConfidence(options.getProperty("encoding_confidence"));
ProjectManager.singleton.registerProject(project, pm);
project.columnModel.update();
@ -125,7 +127,10 @@ public class CreateProjectCommand extends Command {
if (importer.takesReader()) {
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
options.setProperty("encoding", charsetMatch.getName());
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
Reader reader = charsetMatch.getReader();
try {

View File

@ -8,8 +8,6 @@ import org.apache.commons.lang.StringUtils;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
import com.metaweb.gridworks.gel.Function;
public class Fingerprint implements Function {
@ -28,7 +26,7 @@ public class Fingerprint implements Function {
return StringUtils.join(frags," "); // rejoin them with a single space between them
}
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
return null;
}
public void write(JSONWriter writer, Properties options)

View File

@ -6,9 +6,12 @@ import java.util.Properties;
import org.json.JSONException;
import org.json.JSONWriter;
import com.metaweb.gridworks.ProjectManager;
import com.metaweb.gridworks.ProjectMetadata;
import com.metaweb.gridworks.expr.EvalError;
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
import com.metaweb.gridworks.gel.Function;
import com.metaweb.gridworks.model.Project;
public class Reinterpret implements Function {
@ -18,13 +21,16 @@ public class Reinterpret implements Function {
Object o2 = args[1];
if (o1 != null && o2 != null && o2 instanceof String) {
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
String decoder = (String) o2;
Project project = (Project) bindings.get("project");
ProjectMetadata metadata = ProjectManager.singleton.getProjectMetadata(project.id);
String decoder = (String) metadata.getEncoding();
String encoder = (String) o2;
String reinterpreted = null;
try {
reinterpreted = new String(str.getBytes(decoder), "UTF8");
reinterpreted = new String(str.getBytes(decoder), encoder);
} catch (UnsupportedEncodingException e) {
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized.");
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + encoder + "' is not available or recognized.");
}
return reinterpreted;