RandomSec/main/src/com/google/refine/importers/FixedWidthImporter.java

package com.google.refine.importers;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.json.JSONArray;
import org.json.JSONObject;

import com.google.refine.importing.ImportingJob;
import com.google.refine.importing.ImportingUtilities;
import com.google.refine.model.Project;
import com.google.refine.model.medadata.ProjectMetadata;
import com.google.refine.util.JSONUtilities;

public class FixedWidthImporter extends TabularImportingParserBase {
    /**
     * Creates a fixed-width importer.
     *
     * NOTE(review): the {@code false} passed to the base constructor presumably
     * selects Reader-based (rather than InputStream-based) parsing, which would
     * match the {@code Reader} parameter of {@code parseOneFile} below — confirm
     * against TabularImportingParserBase.
     */
    public FixedWidthImporter() {
        super(false);
    }
    
    /**
     * Builds the initial parser options for fixed-width files.
     *
     * Starts from the options produced by the superclass. When at least one file
     * record is supplied, the first record's file is sampled with
     * {@link #guessColumnWidths(File, String)} and the options are extended with:
     * <ul>
     *   <li>{@code headerLines} = 0</li>
     *   <li>{@code columnWidths} = the guessed widths (an empty array when the
     *       record has no {@code location} or no guess could be made)</li>
     *   <li>{@code guessCellValueTypes} = false</li>
     * </ul>
     * With no file records the superclass options are returned unchanged.
     *
     * @param job         importing job; its raw data directory is the base for the file location
     * @param fileRecords records describing the files being imported; only the first is inspected
     * @param format      format identifier, forwarded to the superclass
     * @return the options object to hand to the parser UI
     */
    @Override
    public JSONObject createParserUIInitializationData(
            ImportingJob job, List<JSONObject> fileRecords, String format) {
        final JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);
        if (fileRecords.isEmpty()) {
            // Nothing to sample: keep the inherited defaults as they are.
            return options;
        }

        // Only the first file is used to guess the column layout.
        final JSONObject firstRecord = fileRecords.get(0);
        final String encoding = ImportingUtilities.getEncoding(firstRecord);
        final String location = JSONUtilities.getString(firstRecord, "location", null);

        final JSONArray widthsJson = new JSONArray();
        if (location != null) {
            final File rawFile = new File(job.getRawDataDir(), location);
            final int[] guessed = guessColumnWidths(rawFile, encoding);
            if (guessed != null) {
                for (int i = 0; i < guessed.length; i++) {
                    JSONUtilities.append(widthsJson, guessed[i]);
                }
            }
        }

        JSONUtilities.safePut(options, "headerLines", 0);
        JSONUtilities.safePut(options, "columnWidths", widthsJson);
        JSONUtilities.safePut(options, "guessCellValueTypes", false);
        return options;
    }

    /**
     * Parses one fixed-width file into the project.
     *
     * Column widths are read from the {@code columnWidths} option. If the
     * {@code columnNames} option is present and contains at least one non-blank
     * name, the trimmed non-blank names are delivered as the first row and the
     * {@code headerLines} option is set to 1 so that row is treated as the header.
     * Every subsequent row is one line of the reader split by
     * {@code getCells}.
     *
     * After the table has been read, the superclass implementation is invoked
     * with the same arguments.
     * NOTE(review): the superclass call presumably records the import options in
     * the project metadata — confirm against the base class.
     *
     * @param project    project receiving the rows
     * @param metadata   project metadata, passed through
     * @param job        importing job, passed through
     * @param fileSource name of the source file, passed through
     * @param reader     character source of the file content
     * @param limit      row limit, passed through
     * @param options    parser options; {@code headerLines} may be modified here
     * @param exceptions collector for exceptions, passed through
     */
    @Override
    public void parseOneFile(
        Project project,
        ProjectMetadata metadata,
        ImportingJob job,
        String fileSource,
        Reader reader,
        int limit,
        JSONObject options,
        List<Exception> exceptions
    ) {
        final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");

        // Collect user-supplied column names, dropping blank entries.
        List<Object> suppliedNames = null;
        if (options.has("columnNames")) {
            final List<Object> nonBlank = new ArrayList<Object>();
            for (String raw : JSONUtilities.getStringArray(options, "columnNames")) {
                final String trimmed = raw.trim();
                if (!trimmed.isEmpty()) {
                    nonBlank.add(trimmed);
                }
            }
            if (!nonBlank.isEmpty()) {
                // The names are emitted as a synthetic first row, so flag it as the header.
                JSONUtilities.safePut(options, "headerLines", 1);
                suppliedNames = nonBlank;
            }
        }

        final List<Object> columnNames = suppliedNames;
        final LineNumberReader lnReader = new LineNumberReader(reader);

        TableDataReader dataReader = new TableDataReader() {
            // Becomes true once the synthetic header row has been handed out.
            boolean headerEmitted = false;

            @Override
            public List<Object> getNextRowOfCells() throws IOException {
                if (!headerEmitted && columnNames != null) {
                    headerEmitted = true;
                    return columnNames;
                }

                final String line = lnReader.readLine();
                // A null line means end of input, which is signalled by a null row.
                return (line == null) ? null : getCells(line, columnWidths);
            }
        };

        TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);

        super.parseOneFile(project, metadata, job, fileSource, reader, limit, options, exceptions);
    }
    
    /**
     * Cuts one line of text into cells according to a list of field widths.
     *
     * One cell is produced per entry in {@code widths}, in order. A field that
     * runs past the end of the line is truncated to the text that is available.
     * A field that would be empty (the line is already exhausted, or the width is
     * zero or negative) yields a {@code null} cell and does not advance the
     * position. Any text left after the last field is appended as one extra cell.
     *
     * @param line   line to be split
     * @param widths field sizes, in characters
     * @return the cells of the row: substrings of {@code line}, or {@code null} for empty fields
     */
    static private ArrayList<Object> getCells(String line, int[] widths) {
        final int lineLength = line.length();
        final ArrayList<Object> row = new ArrayList<Object>();

        int cursor = 0;
        for (int i = 0; i < widths.length; i++) {
            // Clamp the field end to the end of the line.
            final int fieldEnd = Math.min(cursor + widths[i], lineLength);
            if (cursor < lineLength && fieldEnd > cursor) {
                row.add(line.substring(cursor, fieldEnd));
                cursor = fieldEnd;
            } else {
                // No data for this field (line exhausted or non-positive width).
                row.add(null);
            }
        }

        // Whatever remains beyond the declared fields becomes a trailing cell.
        if (cursor < lineLength) {
            row.add(line.substring(cursor));
        }
        return row;
    }
    
    /**
     * Guesses the field widths of a fixed-width text file by looking for
     * character positions that hold a space on every sampled line.
     *
     * Sampling stops after 100 non-empty lines, after roughly 64K characters, or
     * at end of file. Empty lines are skipped. Only positions within the length
     * of the first non-empty line are considered. Each position that is a space
     * on every sampled line closes a field (the space is included in that
     * field's width). A field of width 1, other than the last one found, is
     * folded into the field that follows it, so a run of separator spaces does
     * not produce one-character columns. Text after the last separator position
     * gets no width of its own.
     *
     * All streams are closed on every path, including when the encoding name is
     * not supported.
     *
     * @param file     file to sample
     * @param encoding charset name used to decode the file, or {@code null} for the platform default
     * @return the guessed widths (possibly an empty array when no separator
     *         position was found); {@code null} when fewer than three non-empty
     *         lines were read, or when the file could not be read or the
     *         encoding is unsupported (the stack trace is printed in those cases)
     */
    static public int[] guessColumnWidths(File file, String encoding) {
        final int maxSampleChars = 64 * 1024;
        final int maxSampleLines = 100;

        // try-with-resources closes the raw stream even if creating the decoder fails.
        try (InputStream is = new FileInputStream(file);
             Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
             LineNumberReader lineNumberReader = new LineNumberReader(reader)) {

            int[] spaceCounts = null; // per position: number of sampled lines with a space there
            int charsRead = 0;
            int lineCount = 0;
            String s;
            while (charsRead < maxSampleChars &&
                   lineCount < maxSampleLines &&
                   (s = lineNumberReader.readLine()) != null) {

                charsRead += s.length() + 1; // count the new line character
                if (s.length() == 0) {
                    continue;
                }
                lineCount++;

                if (spaceCounts == null) {
                    // Sized by the first non-empty line; Java zero-initialises the array.
                    spaceCounts = new int[s.length()];
                }

                final int limit = Math.min(spaceCounts.length, s.length());
                for (int c = 0; c < limit; c++) {
                    if (s.charAt(c) == ' ') {
                        spaceCounts[c]++;
                    }
                }
            }

            // Fewer than three lines is too little evidence of a fixed-width layout.
            if (spaceCounts != null && lineCount > 2) {
                List<Integer> widths = new ArrayList<Integer>();

                int startIndex = 0;
                for (int c = 0; c < spaceCounts.length; c++) {
                    if (spaceCounts[c] == lineCount) {
                        widths.add(c - startIndex + 1);
                        startIndex = c + 1;
                    }
                }

                // Walk backwards so removals do not disturb indices still to be visited.
                for (int i = widths.size() - 2; i >= 0; i--) {
                    if (widths.get(i) == 1) {
                        widths.set(i + 1, widths.get(i + 1) + 1);
                        widths.remove(i);
                    }
                }

                int[] widthA = new int[widths.size()];
                for (int i = 0; i < widthA.length; i++) {
                    widthA[i] = widths.get(i);
                }
                return widthA;
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width 2013-03-04 23:47:06 +01:00			`package com.google.refine.importers;`

			`import java.io.File;`
			`import java.io.FileInputStream;`
			`import java.io.IOException;`
			`import java.io.InputStream;`
			`import java.io.InputStreamReader;`
			`import java.io.LineNumberReader;`
			`import java.io.Reader;`
			`import java.io.UnsupportedEncodingException;`
			`import java.util.ArrayList;`
			`import java.util.List;`

			`import org.json.JSONArray;`
			`import org.json.JSONObject;`

			`import com.google.refine.importing.ImportingJob;`
			`import com.google.refine.importing.ImportingUtilities;`
			`import com.google.refine.model.Project;`
data package metadata (#1398) * fix the appbundle issue #1209 * fix #1162 allow the JRE 9 * fix the package declarations * remove the _ from the method name * use the explicit scoping * remote extra ; * fix issued from codacy * fix issued from codacy * add preferences link to the index page * handle the empty user metadata * fix 'last modified' sorting issue #1307 * prevent overflow of the table. issue #1306 * add isoDateParser to sort the date * prevent overflow of the project index * remove sorter arrow for action columns * disable editing the internal metadata * adjust the width of the table * change MetaData to Metadata * change the filed name from rowNumber to rowCount * put back the incidently deleted gitignore * add double quote to prevent word splitting * UI improvement on metadata view and project list view * remove the date field in metadata * message notification of the free RAM. Issue #1295 * UI tuning for metadata view * shorten the ISO date to locale date format * Added translation using Weblate (Portuguese (Brazil)) * remove the rename link * Ignore empty language files introduced by Weblate * Add UI for Invert text filter * Backend support for Inverting Text search facets * Fix reset on text search facet * More succinct return statements * add tests for SetProjectMetadataCommand * Tidying up for Codacy * Added Tests for TextSearchFilter * Corrections for Codacy * More code tidy up * let the browser auto fit the table cell when resizing/zooming * fix import multiple excel with mulitple sheets issue #1328 * check if the project has the userMetadata * fix the unit test support multi files with multi tables for open office * prevent the same key for user metadata * replace _ with variable for exception * fix the no-undef issue * to adjust the width of transform dialog. 
issue #1332 * fix the row count refresh issue * extract method * move the log message * cosmatic changes for codacy * fix typo * bump to version 2.8 * .gitignore is now working * preview stage won't have the metadata populated, so protect NPE * Update README.md No more direct link to the last version tag, which will avoid having to think of updating the readme * refacotring the ProjectMetadata class * introduce the IMetadata interface * create submodule of dataschema * add back * setup lib for dataschema; upgrade the apache lang to lang3 * replace escape* functions from apache lang3 * replace the ProjectMetadata with IMetadata interface * add missing jars * set the IMetadata a field of Project * remove PreferenceStore out of Project model * fix test SetProjectMetadataCommandTests by casting * introdcue the AbstractMetadata * introdcue the AbstractMetadata * reorganize the metadata package * allow have mulitiple metadata for a project * support for mulitple metadata format * remove jdk7 since 'table schema' java implmentation only support jdk8+ * set execute permission for script * fix the Unit Test after Metadata refactoring * restore the apache lang2.5 since jetty 6.1.22 depend on it * add commons lang 2.5 jar * git submodule add https://github.com/frictionlessdata/datapackage-java * remove the metadata parameter from the ProjectManager.registerProject method * remove hashmap _projectsMetadata field from the ProjectManager and FileProjectManager * init the Project.metadataMap * fix Unit Test * restore the ProjectMetaData map to ProjectManager * put the ProjectMetaDta in place for ProjectManager and Project object * check null of singleton instead of create a constructor just for test * load the data package metadata * importing data package * importing data package * encapsulate the Package class into DataPackageMetadata * user _ to indicate the class fields * introduce base URL in order to download the data files * import data package UI and draft backend * 
import data package UI * fix typo * download the data set pointed from metadata resource * save and load the data package metadata * avoid magic string * package cleanup * set the java_version to 1.8 * set the min jdk to 1.8 * add the 3rd party src in the build.xml * skip the file selection page if only 1 DATA file * add files structure for json editor * seperate out the metadata file from the retrival file list * rename the OKF_METADATA to DATAPACKAGE_METADATA * clean up * implement GetMetadateCommand class * display the metadata in json format * git submodule update --remote --merge * adjust the setting after pulling from datapackage origin * fix the failed UT DateExtensionTests.testFetchCounts due to new json jar json-20160810.jar will complain: JSONObject["float"] not a string. * clean up the weird loop array syntax get complained * remove the unused constant * export in data package format * interface cleanup * fix UT * edit the metadata * add UT for SetMetadataCommand * fix UT for SetMetadataCommand * display the data package metadata link on the project index page * update submodule * log the exceptions * Ajv does not work properly, use the back end validation instead * enable the validation for jsoneditor * first draft of the data validation * create a map to hold the constraint and its handler * rename * support for minLength and maxLength from spec * add validate command * test the opeation instead of validate command * rename the UT * format the error message and push to the report * fix row number * add resource bundle for validator * inject the code of the constrains * make the StrSubstitutor works * extract the type and format information * add the customizedFormat to interface to allow format properly * get rid of magic string * take care of missing parts of the data package * implement RequiredConstraint * patch for number type * add max/min constraints * get the constrains directly from field * implement the PatternConstraint * suppress warning * 
fix the broken UT when expecting 2 digits fraction * handle the cast and type properly * fix the missing resource files for data package when run from command line * use the copy instead of copydir * add script for appveyor * update script for appveyor * do recursive clone * correct the git url * fix clone path * clone folder option does not work * will put another PR for this. delete for now * revert the interface method name * lazy loading the project data * disable the validate menu for now * add UT * assert UTs * add UT * fix #1386 * remove import * test the thread * Revert "test the thread" This reverts commit 779214160055afe3ccdcc18c57b0c7c72e87c824. * fix the URLCachingTest UT * define the template data package * tidy up the metadata interface * check the http response code * fix the package * display user friendly message when URL path is not reachable * populate the data package schema * Delete hs_err_pid15194.log * populate data package info * add username preference and it will be pulled as the creator of the metadata * undo the project.updateColumnChange() and start to introduce the fields into the existing core model * tightly integrate the data package metadata * tightly integrate the data package metadata for project level * remove the submodule * move the edit botton * clean up build * load the new property * load the project metadata * fix issues from codacy * remove unused fields and annotation * check the http response code firstly * import zipped data package * allow without keywords * process the zip data package from url * merge the tags * check store firstly * remove the table schema src * move the json schema files to schema dir * add comment * add comment * remove git moduels * add incidently deleted file * fix typo * remove SetMetadataCommand * revert change * merge from master 2018-02-02 14:24:19 +01:00			`import com.google.refine.model.medadata.ProjectMetadata;`
Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width 2013-03-04 23:47:06 +01:00			`import com.google.refine.util.JSONUtilities;`

			`public class FixedWidthImporter extends TabularImportingParserBase {`
			`public FixedWidthImporter() {`
			`super(false);`
			`}`

			`@Override`
			`public JSONObject createParserUIInitializationData(`
			`ImportingJob job, List<JSONObject> fileRecords, String format) {`
			`JSONObject options = super.createParserUIInitializationData(job, fileRecords, format);`
			`JSONArray columnWidths = new JSONArray();`
			`if (fileRecords.size() > 0) {`
			`JSONObject firstFileRecord = fileRecords.get(0);`
			`String encoding = ImportingUtilities.getEncoding(firstFileRecord);`
			`String location = JSONUtilities.getString(firstFileRecord, "location", null);`
			`if (location != null) {`
			`File file = new File(job.getRawDataDir(), location);`
			`int[] columnWidthsA = guessColumnWidths(file, encoding);`
			`if (columnWidthsA != null) {`
			`for (int w : columnWidthsA) {`
			`JSONUtilities.append(columnWidths, w);`
			`}`
			`}`
			`}`

			`JSONUtilities.safePut(options, "headerLines", 0);`
			`JSONUtilities.safePut(options, "columnWidths", columnWidths);`
Turn all import conversions off by default - fixes #478 2013-07-27 19:32:26 +02:00			`JSONUtilities.safePut(options, "guessCellValueTypes", false);`
Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width 2013-03-04 23:47:06 +01:00			`}`
			`return options;`
			`}`

			`@Override`
			`public void parseOneFile(`
			`Project project,`
			`ProjectMetadata metadata,`
			`ImportingJob job,`
			`String fileSource,`
			`Reader reader,`
			`int limit,`
			`JSONObject options,`
			`List<Exception> exceptions`
			`) {`
			`final int[] columnWidths = JSONUtilities.getIntArray(options, "columnWidths");`

			`List<Object> retrievedColumnNames = null;`
			`if (options.has("columnNames")) {`
			`String[] strings = JSONUtilities.getStringArray(options, "columnNames");`
			`if (strings.length > 0) {`
			`retrievedColumnNames = new ArrayList<Object>();`
			`for (String s : strings) {`
			`s = s.trim();`
			`if (!s.isEmpty()) {`
			`retrievedColumnNames.add(s);`
			`}`
			`}`

			`if (retrievedColumnNames.size() > 0) {`
			`JSONUtilities.safePut(options, "headerLines", 1);`
			`} else {`
			`retrievedColumnNames = null;`
			`}`
			`}`
			`}`

			`final List<Object> columnNames = retrievedColumnNames;`
			`final LineNumberReader lnReader = new LineNumberReader(reader);`

			`TableDataReader dataReader = new TableDataReader() {`
			`boolean usedColumnNames = false;`

			`@Override`
			`public List<Object> getNextRowOfCells() throws IOException {`
			`if (columnNames != null && !usedColumnNames) {`
			`usedColumnNames = true;`
			`return columnNames;`
			`} else {`
			`String line = lnReader.readLine();`
			`if (line == null) {`
			`return null;`
			`} else {`
			`return getCells(line, columnWidths);`
			`}`
			`}`
			`}`
			`};`

			`TabularImportingParserBase.readTable(project, metadata, job, dataReader, fileSource, limit, options, exceptions);`
add the import options to metadata 2017-10-22 05:33:19 +02:00
			`super.parseOneFile(project, metadata, job, fileSource, reader, limit, options, exceptions);`
Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width 2013-03-04 23:47:06 +01:00			`}`

			`/**`
			`* Splits the line into columns`
Remove references to obsolete splitIntoColumns option 2013-09-19 00:44:58 +02:00			`* @param line Line to be split`
			`* @param widths array of integers with field sizes`
Don't guess field widths unless we have at least 3 lines - Investigation of #685 showed that single line files were being guessed as fixed field width 2013-03-04 23:47:06 +01:00			`* @return`
			`*/`
			`static private ArrayList<Object> getCells(String line, int[] widths) {`
			`ArrayList<Object> cells = new ArrayList<Object>();`

			`int columnStartCursor = 0;`
			`int columnEndCursor = 0;`
			`for (int width : widths) {`
			`if (columnStartCursor >= line.length()) {`
			`cells.add(null); //FIXME is adding a null cell (to represent no data) OK?`
			`continue;`
			`}`

			`columnEndCursor = columnStartCursor + width;`

			`if (columnEndCursor > line.length()) {`
			`columnEndCursor = line.length();`
			`}`
			`if (columnEndCursor <= columnStartCursor) {`
			`cells.add(null); //FIXME is adding a null cell (to represent no data, or a zero width column) OK?`
			`continue;`
			`}`

			`cells.add(line.substring(columnStartCursor, columnEndCursor));`

			`columnStartCursor = columnEndCursor;`
			`}`

			`// Residual text`
			`if (columnStartCursor < line.length()) {`
			`cells.add(line.substring(columnStartCursor));`
			`}`
			`return cells;`
			`}`

			`static public int[] guessColumnWidths(File file, String encoding) {`
			`try {`
			`InputStream is = new FileInputStream(file);`
			`Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);`
			`LineNumberReader lineNumberReader = new LineNumberReader(reader);`

			`try {`
			`int[] counts = null;`
			`int totalBytes = 0;`
			`int lineCount = 0;`
			`String s;`
			`while (totalBytes < 64 * 1024 &&`
			`lineCount < 100 &&`
			`(s = lineNumberReader.readLine()) != null) {`

			`totalBytes += s.length() + 1; // count the new line character`
			`if (s.length() == 0) {`
			`continue;`
			`}`
			`lineCount++;`

			`if (counts == null) {`
			`counts = new int[s.length()];`
			`for (int c = 0; c < counts.length; c++) {`
			`counts[c] = 0;`
			`}`
			`}`

			`for (int c = 0; c < counts.length && c < s.length(); c++) {`
			`char ch = s.charAt(c);`
			`if (ch == ' ') {`
			`counts[c]++;`
			`}`
			`}`
			`}`

			`if (counts != null && lineCount > 2) {`
			`List<Integer> widths = new ArrayList<Integer>();`

			`int startIndex = 0;`
			`for (int c = 0; c < counts.length; c++) {`
			`int count = counts[c];`
			`if (count == lineCount) {`
			`widths.add(c - startIndex + 1);`
			`startIndex = c + 1;`
			`}`
			`}`

			`for (int i = widths.size() - 2; i >= 0; i--) {`
			`if (widths.get(i) == 1) {`
			`widths.set(i + 1, widths.get(i + 1) + 1);`
			`widths.remove(i);`
			`}`
			`}`

			`int[] widthA = new int[widths.size()];`
			`for (int i = 0; i < widthA.length; i++) {`
			`widthA[i] = widths.get(i);`
			`}`
			`return widthA;`
			`}`
			`} finally {`
			`lineNumberReader.close();`
			`reader.close();`
			`is.close();`
			`}`
			`} catch (UnsupportedEncodingException e) {`
			`e.printStackTrace();`
			`} catch (IOException e) {`
			`e.printStackTrace();`
			`}`
			`return null;`
			`}`
			`}`