- save encoding and confidence in the project metadata
- use the saved encoding for decoding
- don't error when fingerprinting null

git-svn-id: http://google-refine.googlecode.com/svn/trunk@160 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
bc9bc54d30
commit
621655372f
@ -14,6 +14,8 @@ public class ProjectMetadata implements Serializable, Jsonizable {
|
||||
private final Date _created = new Date();
|
||||
private String _name;
|
||||
private String _password;
|
||||
private String _encoding;
|
||||
private int _encodingConfidence;
|
||||
private Date _modified = new Date();
|
||||
|
||||
public Date getCreated() {
|
||||
@ -28,6 +30,26 @@ public class ProjectMetadata implements Serializable, Jsonizable {
|
||||
return _name;
|
||||
}
|
||||
|
||||
public void setEncoding(String encoding) {
|
||||
this._encoding = encoding;
|
||||
}
|
||||
|
||||
public String getEncoding() {
|
||||
return _encoding;
|
||||
}
|
||||
|
||||
public void setEncodingConfidence(int confidence) {
|
||||
this._encodingConfidence = confidence;
|
||||
}
|
||||
|
||||
public void setEncodingConfidence(String confidence) {
|
||||
this.setEncodingConfidence(Integer.parseInt(confidence));
|
||||
}
|
||||
|
||||
public int getEncodingConfidence() {
|
||||
return _encodingConfidence;
|
||||
}
|
||||
|
||||
public void setPassword(String password) {
|
||||
this._password = password;
|
||||
}
|
||||
|
@ -49,6 +49,8 @@ public class CreateProjectCommand extends Command {
|
||||
ProjectMetadata pm = new ProjectMetadata();
|
||||
pm.setName(options.getProperty("project-name"));
|
||||
pm.setPassword(options.getProperty("project-password"));
|
||||
pm.setEncoding(options.getProperty("encoding"));
|
||||
pm.setEncodingConfidence(options.getProperty("encoding_confidence"));
|
||||
ProjectManager.singleton.registerProject(project, pm);
|
||||
|
||||
project.columnModel.update();
|
||||
@ -125,7 +127,10 @@ public class CreateProjectCommand extends Command {
|
||||
|
||||
if (importer.takesReader()) {
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
|
||||
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
|
||||
options.setProperty("encoding", charsetMatch.getName());
|
||||
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
|
||||
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
|
||||
Reader reader = charsetMatch.getReader();
|
||||
try {
|
||||
|
@ -8,8 +8,6 @@ import org.apache.commons.lang.StringUtils;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
|
||||
public class Fingerprint implements Function {
|
||||
@ -28,7 +26,7 @@ public class Fingerprint implements Function {
|
||||
return StringUtils.join(frags," "); // rejoin them with a single space between them
|
||||
|
||||
}
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
|
||||
return null;
|
||||
}
|
||||
|
||||
public void write(JSONWriter writer, Properties options)
|
||||
|
@ -6,9 +6,12 @@ import java.util.Properties;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONWriter;
|
||||
|
||||
import com.metaweb.gridworks.ProjectManager;
|
||||
import com.metaweb.gridworks.ProjectMetadata;
|
||||
import com.metaweb.gridworks.expr.EvalError;
|
||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||
import com.metaweb.gridworks.gel.Function;
|
||||
import com.metaweb.gridworks.model.Project;
|
||||
|
||||
public class Reinterpret implements Function {
|
||||
|
||||
@ -18,13 +21,16 @@ public class Reinterpret implements Function {
|
||||
Object o2 = args[1];
|
||||
if (o1 != null && o2 != null && o2 instanceof String) {
|
||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||
String decoder = (String) o2;
|
||||
Project project = (Project) bindings.get("project");
|
||||
ProjectMetadata metadata = ProjectManager.singleton.getProjectMetadata(project.id);
|
||||
String decoder = (String) metadata.getEncoding();
|
||||
String encoder = (String) o2;
|
||||
String reinterpreted = null;
|
||||
|
||||
try {
|
||||
reinterpreted = new String(str.getBytes(decoder), "UTF8");
|
||||
reinterpreted = new String(str.getBytes(decoder), encoder);
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized.");
|
||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + encoder + "' is not available or recognized.");
|
||||
}
|
||||
|
||||
return reinterpreted;
|
||||
|
Loading…
Reference in New Issue
Block a user