RandomSec/main/src/com/google/refine/importers/FixedWidthImporter.java
Jacky c4b0ff6bea data package metadata (#1398)
* fix the appbundle issue #1209

* fix #1162

allow the JRE 9

* fix the package declarations

* remove the _ from the method name

* use the explicit scoping

* remote extra ;

* fix issued from codacy

* fix issued from codacy

* add preferences link to the index page

* handle the empty user metadata

* fix 'last modified' sorting issue #1307

* prevent overflow of the table. issue #1306

* add isoDateParser to sort the date

* prevent overflow of the project index

* remove sorter arrow for action columns

* disable editing the internal metadata

* adjust the width of the table

* change MetaData to Metadata

* change the filed name from rowNumber to rowCount

* put back the incidently deleted gitignore

* add double quote to prevent word splitting

* UI improvement on metadata view and project list view

* remove the date field in metadata

* message notification of the free RAM. Issue #1295

* UI tuning for metadata view

* shorten the ISO date to locale date format

* Added translation using Weblate (Portuguese (Brazil))

* remove the rename link

* Ignore empty language files introduced by Weblate

* Add UI for Invert text filter

* Backend support for Inverting Text search facets

* Fix reset on text search facet

* More succinct return statements

* add tests for SetProjectMetadataCommand

* Tidying up for Codacy

* Added Tests for TextSearchFilter

* Corrections for Codacy

* More code tidy up

* let the browser auto fit the table cell when resizing/zooming

* fix import multiple excel with mulitple sheets issue #1328

* check if the project has the userMetadata

* fix the unit test
support multi files with multi tables for open office

* prevent the same key for user metadata

* replace _ with variable for exception

* fix the no-undef issue

* to adjust the width of transform dialog. issue #1332

* fix the row count refresh issue

* extract method

* move the log message

* cosmatic changes for codacy

* fix typo

* bump to version 2.8

* .gitignore is now working

* preview stage won't have the metadata populated, so protect NPE

* Update README.md

No more direct link to the last version tag, which will avoid having to think of updating the readme

* refacotring the ProjectMetadata class

* introduce the IMetadata interface

* create submodule of dataschema

* add back

* setup lib for dataschema; upgrade the apache lang to lang3

* replace escape* functions from apache lang3

* replace the ProjectMetadata with IMetadata interface

* add missing jars

* set the IMetadata a field of Project

* remove PreferenceStore out of Project model

* fix test SetProjectMetadataCommandTests by casting

* introdcue the AbstractMetadata

* introdcue the AbstractMetadata

* reorganize the metadata package

* allow have mulitiple metadata for a project

* support for mulitple metadata format

* remove jdk7 since 'table schema' java implmentation only support jdk8+

* set execute permission for script

* fix the Unit Test after Metadata refactoring

* restore the apache lang2.5 since jetty 6.1.22 depend on it

* add commons lang 2.5 jar

* git submodule add  https://github.com/frictionlessdata/datapackage-java

* remove the metadata parameter from the ProjectManager.registerProject method

* remove hashmap _projectsMetadata field from the ProjectManager and FileProjectManager

* init the Project.metadataMap

* fix Unit Test

* restore the ProjectMetaData map to ProjectManager

* put the ProjectMetaDta in place for ProjectManager and Project object

* check null of singleton instead of create a constructor just for test

* load the data package metadata

* importing data package

* importing data package

* encapsulate the Package class into DataPackageMetadata

* user _ to indicate the class fields

* introduce base URL in order to download the data files

* import data package UI and draft backend

* import data package UI

* fix typo

* download the data set pointed from metadata resource

* save and load the data package metadata

* avoid magic string

* package cleanup

* set the java_version to 1.8

* set the min jdk to 1.8

* add the 3rd party src in the build.xml

* skip the file selection page if only 1 DATA file

* add files structure for json editor

* seperate out the metadata file from the retrival file list

* rename the OKF_METADATA to DATAPACKAGE_METADATA

* clean up

* implement GetMetadateCommand class

* display the metadata in json format

* git submodule update --remote --merge

* adjust the setting after pulling from datapackage origin

* fix the failed UT DateExtensionTests.testFetchCounts due to new json jar json-20160810.jar will complain: JSONObject["float"] not a string.

* clean up the weird loop array syntax get complained

* remove the unused constant

* export in data package format

* interface cleanup

* fix UT

* edit the metadata

* add UT for SetMetadataCommand

* fix UT for SetMetadataCommand

* display the data package metadata link on the project index page

* update submodule

* log the exceptions

* Ajv does not work properly, use the back end validation instead

* enable the validation for jsoneditor

* first draft of the data validation

* create a map to hold the constraint and its handler

* rename

* support for minLength and maxLength from spec

* add validate command

* test the opeation instead of validate command

* rename the UT

* format the error message and push to the report

* fix row number

* add resource bundle for validator

* inject the code of the constrains

* make the StrSubstitutor works

* extract the type and format information

* add the customizedFormat to interface to allow format properly

* get rid of magic string

* take care of missing parts of the data package

* implement RequiredConstraint

* patch for number type

* add max/min constraints

* get the constrains directly from field

* implement the PatternConstraint

* suppress warning

* fix the broken UT when expecting 2 digits fraction

* handle the cast and type properly

* fix the missing resource files for data package when run from command line

* use the copy instead of copydir

* add script for appveyor

* update script for appveyor

* do recursive clone

* correct the git url

* fix clone path

* clone folder option does not work

* will put another PR for this. delete for now

* revert the interface method name

* lazy loading the project data

* disable the validate menu for now

* add UT

* assert UTs

* add UT

* fix #1386

* remove import

* test the thread

* Revert "test the thread"

This reverts commit 779214160055afe3ccdcc18c57b0c7c72e87c824.

* fix the URLCachingTest UT

* define the template data package

* tidy up the metadata interface

* check the http response code

* fix the package

* display user friendly message when URL path is not reachable

* populate the data package schema

* Delete hs_err_pid15194.log

* populate data package info

* add username  preference and it will be pulled as the creator of the metadata

* undo the project.updateColumnChange() and start to introduce the fields into the existing core model

* tightly integrate the data package metadata

* tightly integrate the data package metadata for project level

* remove the submodule

* move the edit botton

* clean up build

* load the new property

* load the project metadata

* fix issues from codacy

* remove unused fields and annotation

* check the http response code firstly

* import zipped data package

* allow without keywords

* process the zip data package from url

* merge the tags

* check store firstly

* remove the table schema src

* move the json schema files to schema dir

* add comment

* add comment

* remove git moduels

* add incidently deleted file

* fix typo

* remove SetMetadataCommand

* revert change

* merge from master
2018-02-02 13:24:19 +00:00

227 lines
8.2 KiB
Java

package com.google.refine.importers;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import org.json.JSONArray;
import org.json.JSONObject;
import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingUtilities;
import com.google.refine.model.Project;
import com.google.refine.model.medadata.ProjectMetadata;
import com.google.refine.util.JSONUtilities;
public class FixedWidthImporter extends TabularImportingParserBase {
public FixedWidthImporter() {
super(false);
}
@Override
public JSONObject createParserUIInitializationData(
ImportingJob job, List<JSONObject> fileRecords, String format) {
JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
JSONArray columnWidths = new JSONArray();
if (fileRecords.size() > 0) {
JSONObject firstFileRecord = fileRecords.get(0);
String encoding = ImportingUtilities.getEncoding(firstFileRecord);
String location = JSONUtilities.getString(firstFileRecord, "location", null);
if (location != null) {
File file = new File(job.getRawDataDir(), location);
int[] columnWidthsA = guessColumnWidths(file, encoding);
if (columnWidthsA != null) {
for (int w : columnWidthsA) {
JSONUtilities.append(columnWidths, w);
}
}
}
JSONUtilities.safePut(options, "headerLines", 0);
JSONUtilities.safePut(options, "columnWidths", columnWidths);
JSONUtilities.safePut(options, "guessCellValueTypes", false);
}
return options;
}
@Override
public void parseOneFile(
Project project,
ProjectMetadata metadata,
ImportingJob job,
String fileSource,
Reader reader,
int limit,
JSONObject options,
List<Exception> exceptions
) {
final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");
List<Object> retrievedColumnNames = null;
if (options.has("columnNames")) {
String[] strings = JSONUtilities.getStringArray(options, "columnNames");
if (strings.length > 0) {
retrievedColumnNames = new ArrayList<Object>();
for (String s : strings) {
s = s.trim();
if (!s.isEmpty()) {
retrievedColumnNames.add(s);
}
}
if (retrievedColumnNames.size() > 0) {
JSONUtilities.safePut(options, "headerLines", 1);
} else {
retrievedColumnNames = null;
}
}
}
final List<Object> columnNames = retrievedColumnNames;
final LineNumberReader lnReader = new LineNumberReader(reader);
TableDataReader dataReader = new TableDataReader() {
boolean usedColumnNames = false;
@Override
public List<Object> getNextRowOfCells() throws IOException {
if (columnNames != null && !usedColumnNames) {
usedColumnNames = true;
return columnNames;
} else {
String line = lnReader.readLine();
if (line == null) {
return null;
} else {
return getCells(line, columnWidths);
}
}
}
};
TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);
super.parseOneFile(project, metadata, job, fileSource, reader, limit, options, exceptions);
}
/**
* Splits the line into columns
* @param line Line to be split
* @param widths array of integers with field sizes
* @return
*/
static private ArrayList<Object> getCells(String line, int[] widths) {
ArrayList<Object> cells = new ArrayList<Object>();
int columnStartCursor = 0;
int columnEndCursor = 0;
for (int width : widths) {
if (columnStartCursor >= line.length()) {
cells.add(null); //FIXME is adding a null cell (to represent no data) OK?
continue;
}
columnEndCursor = columnStartCursor + width;
if (columnEndCursor > line.length()) {
columnEndCursor = line.length();
}
if (columnEndCursor <= columnStartCursor) {
cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?
continue;
}
cells.add(line.substring(columnStartCursor, columnEndCursor));
columnStartCursor = columnEndCursor;
}
// Residual text
if (columnStartCursor < line.length()) {
cells.add(line.substring(columnStartCursor));
}
return cells;
}
static public int[] guessColumnWidths(File file, String encoding) {
try {
InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
LineNumberReader lineNumberReader = new LineNumberReader(reader);
try {
int[] counts = null;
int totalBytes = 0;
int lineCount = 0;
String s;
while (totalBytes < 64 * 1024 &&
lineCount < 100 &&
(s = lineNumberReader.readLine()) != null) {
totalBytes += s.length() + 1; // count the new line character
if (s.length() == 0) {
continue;
}
lineCount++;
if (counts == null) {
counts = new int[s.length()];
for (int c = 0; c < counts.length; c++) {
counts[c] = 0;
}
}
for (int c = 0; c < counts.length && c < s.length(); c++) {
char ch = s.charAt(c);
if (ch == ' ') {
counts[c]++;
}
}
}
if (counts != null && lineCount > 2) {
List<Integer> widths = new ArrayList<Integer>();
int startIndex = 0;
for (int c = 0; c < counts.length; c++) {
int count = counts[c];
if (count == lineCount) {
widths.add(c - startIndex + 1);
startIndex = c + 1;
}
}
for (int i = widths.size() - 2; i >= 0; i--) {
if (widths.get(i) == 1) {
widths.set(i + 1, widths.get(i + 1) + 1);
widths.remove(i);
}
}
int[] widthA = new int[widths.size()];
for (int i = 0; i < widthA.length; i++) {
widthA[i] = widths.get(i);
}
return widthA;
}
} finally {
lineNumberReader.close();
reader.close();
is.close();
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
}