- save encoding and confidence in the project metadata
- use the saved encoding for decoding
- don't error when fingerprinting null

git-svn-id: http://google-refine.googlecode.com/svn/trunk@160 7d457c2a-affb-35e4-300a-418c747d4874
This commit is contained in:
parent
bc9bc54d30
commit
621655372f
@ -14,6 +14,8 @@ public class ProjectMetadata implements Serializable, Jsonizable {
|
|||||||
private final Date _created = new Date();
|
private final Date _created = new Date();
|
||||||
private String _name;
|
private String _name;
|
||||||
private String _password;
|
private String _password;
|
||||||
|
private String _encoding;
|
||||||
|
private int _encodingConfidence;
|
||||||
private Date _modified = new Date();
|
private Date _modified = new Date();
|
||||||
|
|
||||||
public Date getCreated() {
|
public Date getCreated() {
|
||||||
@ -28,6 +30,26 @@ public class ProjectMetadata implements Serializable, Jsonizable {
|
|||||||
return _name;
|
return _name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setEncoding(String encoding) {
|
||||||
|
this._encoding = encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEncoding() {
|
||||||
|
return _encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEncodingConfidence(int confidence) {
|
||||||
|
this._encodingConfidence = confidence;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEncodingConfidence(String confidence) {
|
||||||
|
this.setEncodingConfidence(Integer.parseInt(confidence));
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getEncodingConfidence() {
|
||||||
|
return _encodingConfidence;
|
||||||
|
}
|
||||||
|
|
||||||
public void setPassword(String password) {
|
public void setPassword(String password) {
|
||||||
this._password = password;
|
this._password = password;
|
||||||
}
|
}
|
||||||
|
@ -49,6 +49,8 @@ public class CreateProjectCommand extends Command {
|
|||||||
ProjectMetadata pm = new ProjectMetadata();
|
ProjectMetadata pm = new ProjectMetadata();
|
||||||
pm.setName(options.getProperty("project-name"));
|
pm.setName(options.getProperty("project-name"));
|
||||||
pm.setPassword(options.getProperty("project-password"));
|
pm.setPassword(options.getProperty("project-password"));
|
||||||
|
pm.setEncoding(options.getProperty("encoding"));
|
||||||
|
pm.setEncodingConfidence(options.getProperty("encoding_confidence"));
|
||||||
ProjectManager.singleton.registerProject(project, pm);
|
ProjectManager.singleton.registerProject(project, pm);
|
||||||
|
|
||||||
project.columnModel.update();
|
project.columnModel.update();
|
||||||
@ -125,7 +127,10 @@ public class CreateProjectCommand extends Command {
|
|||||||
|
|
||||||
if (importer.takesReader()) {
|
if (importer.takesReader()) {
|
||||||
CharsetDetector detector = new CharsetDetector();
|
CharsetDetector detector = new CharsetDetector();
|
||||||
|
detector.setDeclaredEncoding("utf8"); // the content on the web is encoded in UTF-8 so assume that
|
||||||
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
|
CharsetMatch charsetMatch = detector.setText(enforceMarking(filePart.getInputStream())).detect();
|
||||||
|
options.setProperty("encoding", charsetMatch.getName());
|
||||||
|
options.setProperty("encoding_confidence", Integer.toString(charsetMatch.getConfidence()));
|
||||||
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
|
logger.info("Best encoding guess: " + charsetMatch.getName() + " [confidence: " + charsetMatch.getConfidence() + "]");
|
||||||
Reader reader = charsetMatch.getReader();
|
Reader reader = charsetMatch.getReader();
|
||||||
try {
|
try {
|
||||||
|
@ -8,8 +8,6 @@ import org.apache.commons.lang.StringUtils;
|
|||||||
import org.json.JSONException;
|
import org.json.JSONException;
|
||||||
import org.json.JSONWriter;
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
import com.metaweb.gridworks.expr.EvalError;
|
|
||||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
|
||||||
import com.metaweb.gridworks.gel.Function;
|
import com.metaweb.gridworks.gel.Function;
|
||||||
|
|
||||||
public class Fingerprint implements Function {
|
public class Fingerprint implements Function {
|
||||||
@ -28,7 +26,7 @@ public class Fingerprint implements Function {
|
|||||||
return StringUtils.join(frags," "); // rejoin them with a single space between them
|
return StringUtils.join(frags," "); // rejoin them with a single space between them
|
||||||
|
|
||||||
}
|
}
|
||||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " expects a string");
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(JSONWriter writer, Properties options)
|
public void write(JSONWriter writer, Properties options)
|
||||||
|
@ -6,9 +6,12 @@ import java.util.Properties;
|
|||||||
import org.json.JSONException;
|
import org.json.JSONException;
|
||||||
import org.json.JSONWriter;
|
import org.json.JSONWriter;
|
||||||
|
|
||||||
|
import com.metaweb.gridworks.ProjectManager;
|
||||||
|
import com.metaweb.gridworks.ProjectMetadata;
|
||||||
import com.metaweb.gridworks.expr.EvalError;
|
import com.metaweb.gridworks.expr.EvalError;
|
||||||
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
import com.metaweb.gridworks.gel.ControlFunctionRegistry;
|
||||||
import com.metaweb.gridworks.gel.Function;
|
import com.metaweb.gridworks.gel.Function;
|
||||||
|
import com.metaweb.gridworks.model.Project;
|
||||||
|
|
||||||
public class Reinterpret implements Function {
|
public class Reinterpret implements Function {
|
||||||
|
|
||||||
@ -18,13 +21,16 @@ public class Reinterpret implements Function {
|
|||||||
Object o2 = args[1];
|
Object o2 = args[1];
|
||||||
if (o1 != null && o2 != null && o2 instanceof String) {
|
if (o1 != null && o2 != null && o2 instanceof String) {
|
||||||
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
|
||||||
String decoder = (String) o2;
|
Project project = (Project) bindings.get("project");
|
||||||
|
ProjectMetadata metadata = ProjectManager.singleton.getProjectMetadata(project.id);
|
||||||
|
String decoder = (String) metadata.getEncoding();
|
||||||
|
String encoder = (String) o2;
|
||||||
String reinterpreted = null;
|
String reinterpreted = null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
reinterpreted = new String(str.getBytes(decoder), "UTF8");
|
reinterpreted = new String(str.getBytes(decoder), encoder);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + decoder + "' is not available or recognized.");
|
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + ": encoding '" + encoder + "' is not available or recognized.");
|
||||||
}
|
}
|
||||||
|
|
||||||
return reinterpreted;
|
return reinterpreted;
|
||||||
|
Loading…
Reference in New Issue
Block a user