More checks in validation
This commit is contained in:
parent
9b79b8761d
commit
4452095538
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
|||||||
|
|
||||||
|
## 1.18.2.0
|
||||||
|
|
||||||
|
* During validation, check the number of columns
|
||||||
|
* During validation, check the number of lines
|
||||||
|
* Validate train files
|
||||||
|
|
||||||
|
## 1.18.1.0
|
||||||
|
|
||||||
|
* During validation, check whether the maximum value is obtained with the expected data
|
||||||
|
|
||||||
## 1.18.0.0
|
## 1.18.0.0
|
||||||
|
|
||||||
* Add --validate option
|
* Add --validate option
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
name: geval
|
name: geval
|
||||||
version: 1.18.1.0
|
version: 1.18.2.0
|
||||||
synopsis: Machine learning evaluation tools
|
synopsis: Machine learning evaluation tools
|
||||||
description: Please see README.md
|
description: Please see README.md
|
||||||
homepage: http://github.com/name/project
|
homepage: http://github.com/name/project
|
||||||
|
@ -7,7 +7,9 @@ module GEval.Metric
|
|||||||
getMetricOrdering,
|
getMetricOrdering,
|
||||||
listOfAvailableMetrics,
|
listOfAvailableMetrics,
|
||||||
bestPossibleValue,
|
bestPossibleValue,
|
||||||
perfectOutLineFromExpectedLine)
|
perfectOutLineFromExpectedLine,
|
||||||
|
fixedNumberOfColumnsInExpected,
|
||||||
|
fixedNumberOfColumnsInInput)
|
||||||
where
|
where
|
||||||
|
|
||||||
import Data.Word
|
import Data.Word
|
||||||
@ -195,6 +197,17 @@ bestPossibleValue metric = case getMetricOrdering metric of
|
|||||||
TheLowerTheBetter -> 0.0
|
TheLowerTheBetter -> 0.0
|
||||||
TheHigherTheBetter -> 1.0
|
TheHigherTheBetter -> 1.0
|
||||||
|
|
||||||
|
-- | Tells whether, for the given metric, every line of the expected file
-- should have the same number of tab-separated columns.
--
-- Returns 'False' for 'MAP', 'BLEU' and 'GLEU', and 'True' for every
-- other metric.
fixedNumberOfColumnsInExpected :: Metric -> Bool
fixedNumberOfColumnsInExpected metric = case metric of
  MAP  -> False
  BLEU -> False
  GLEU -> False
  _    -> True
|
||||||
|
|
||||||
|
-- | Tells whether, for the given metric, every line of the input file
-- should have the same number of tab-separated columns.
--
-- Returns 'False' for 'SoftFMeasure' and 'ProbabilisticSoftFMeasure'
-- (whatever their parameter), and 'True' for every other metric.
fixedNumberOfColumnsInInput :: Metric -> Bool
fixedNumberOfColumnsInInput metric = case metric of
  SoftFMeasure _              -> False
  ProbabilisticSoftFMeasure _ -> False
  _                           -> True
|
||||||
|
|
||||||
perfectOutLineFromExpectedLine :: Metric -> Text -> Text
|
perfectOutLineFromExpectedLine :: Metric -> Text -> Text
|
||||||
perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
|
perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
|
||||||
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
|
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
|
||||||
|
@ -23,7 +23,7 @@ import qualified Data.Conduit.Text as CT
|
|||||||
import Data.Conduit.Binary (sourceFile, sinkFile)
|
import Data.Conduit.Binary (sourceFile, sinkFile)
|
||||||
import Data.Conduit.AutoDecompress (autoDecompress)
|
import Data.Conduit.AutoDecompress (autoDecompress)
|
||||||
import Data.Conduit.SmartSource (compressedFilesHandled)
|
import Data.Conduit.SmartSource (compressedFilesHandled)
|
||||||
import Data.List (intercalate)
|
import Data.List (intercalate, nub)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
|
|
||||||
import System.IO.Temp
|
import System.IO.Temp
|
||||||
@ -34,12 +34,15 @@ data ValidationException = NoChallengeDirectory FilePath
|
|||||||
| NoReadmeFile FilePath
|
| NoReadmeFile FilePath
|
||||||
| NoGitignoreFile FilePath
|
| NoGitignoreFile FilePath
|
||||||
| EmptyFile FilePath
|
| EmptyFile FilePath
|
||||||
|
| VaryingNumberOfLines
|
||||||
| NoTestDirectories
|
| NoTestDirectories
|
||||||
| TooManyInputFiles [FilePath]
|
| TooManyInputFiles [FilePath]
|
||||||
| TooManyExpectedFiles [FilePath]
|
| TooManyExpectedFiles [FilePath]
|
||||||
|
| TooManyTrainFiles [FilePath]
|
||||||
| OutputFileDetected [FilePath]
|
| OutputFileDetected [FilePath]
|
||||||
| CharacterCRDetected FilePath
|
| CharacterCRDetected FilePath
|
||||||
| SpaceSuffixDetect FilePath
|
| SpaceSuffixDetect FilePath
|
||||||
|
| VaryingNumberOfColumns FilePath
|
||||||
| BestPossibleValueNotObtainedWithExpectedData MetricValue MetricValue
|
| BestPossibleValueNotObtainedWithExpectedData MetricValue MetricValue
|
||||||
|
|
||||||
instance Exception ValidationException
|
instance Exception ValidationException
|
||||||
@ -51,12 +54,15 @@ instance Show ValidationException where
|
|||||||
show (NoReadmeFile filePath) = somethingWrongWithFilesMessage "No README.md file" filePath
|
show (NoReadmeFile filePath) = somethingWrongWithFilesMessage "No README.md file" filePath
|
||||||
show (NoGitignoreFile filePath) = somethingWrongWithFilesMessage "No .gitignore file" filePath
|
show (NoGitignoreFile filePath) = somethingWrongWithFilesMessage "No .gitignore file" filePath
|
||||||
show (EmptyFile filePath) = somethingWrongWithFilesMessage "Empty file" filePath
|
show (EmptyFile filePath) = somethingWrongWithFilesMessage "Empty file" filePath
|
||||||
|
show VaryingNumberOfLines = "The number of lines in input and expected file is not the same"
|
||||||
show NoTestDirectories = "No directories with test data, expected `dev-0` and/or `test-A` directory"
|
show NoTestDirectories = "No directories with test data, expected `dev-0` and/or `test-A` directory"
|
||||||
show (TooManyInputFiles filePaths) = somethingWrongWithFilesMessage "Too many input files" $ intercalate "`, `" filePaths
|
show (TooManyInputFiles filePaths) = somethingWrongWithFilesMessage "Too many input files" $ intercalate "`, `" filePaths
|
||||||
show (TooManyExpectedFiles filePaths) = somethingWrongWithFilesMessage "Too many expected files" $ intercalate "`, `" filePaths
|
show (TooManyExpectedFiles filePaths) = somethingWrongWithFilesMessage "Too many expected files" $ intercalate "`, `" filePaths
|
||||||
|
show (TooManyTrainFiles filePaths) = somethingWrongWithFilesMessage "Too many train files" $ intercalate "`, `" filePaths
|
||||||
show (OutputFileDetected filePaths) = somethingWrongWithFilesMessage "Output file/s detected" $ intercalate "`, `" filePaths
|
show (OutputFileDetected filePaths) = somethingWrongWithFilesMessage "Output file/s detected" $ intercalate "`, `" filePaths
|
||||||
show (CharacterCRDetected filePaths) = somethingWrongWithFilesMessage "Found CR (Carriage Return, 0x0D) character" filePaths
|
show (CharacterCRDetected filePaths) = somethingWrongWithFilesMessage "Found CR (Carriage Return, 0x0D) character" filePaths
|
||||||
show (SpaceSuffixDetect filePaths) = somethingWrongWithFilesMessage "Found space at the end of line" filePaths
|
show (SpaceSuffixDetect filePaths) = somethingWrongWithFilesMessage "Found space at the end of line" filePaths
|
||||||
|
show (VaryingNumberOfColumns filePaths) = somethingWrongWithFilesMessage "The file contains varying number of columns" filePaths
|
||||||
show (BestPossibleValueNotObtainedWithExpectedData expected got) = "The best possible value was not obtained with the expected data, expected: " ++ (show expected) ++ " , obtained: " ++ (show got)
|
show (BestPossibleValueNotObtainedWithExpectedData expected got) = "The best possible value was not obtained with the expected data, expected: " ++ (show expected) ++ " , obtained: " ++ (show got)
|
||||||
|
|
||||||
validationChallenge :: FilePath -> GEvalSpecification -> IO ()
|
validationChallenge :: FilePath -> GEvalSpecification -> IO ()
|
||||||
@ -69,7 +75,8 @@ validationChallenge challengeDirectory spec = do
|
|||||||
checkCorrectFile gitignoreFile
|
checkCorrectFile gitignoreFile
|
||||||
checkCorrectFile readmeFile
|
checkCorrectFile readmeFile
|
||||||
testDirectories <- findTestDirs challengeDirectory
|
testDirectories <- findTestDirs challengeDirectory
|
||||||
checkTestDirectories testDirectories
|
checkTestDirectories mainMetric testDirectories
|
||||||
|
checkTrainDirectory mainMetric challengeDirectory
|
||||||
|
|
||||||
mapM_ (runOnTest spec) testDirectories
|
mapM_ (runOnTest spec) testDirectories
|
||||||
|
|
||||||
@ -77,7 +84,7 @@ validationChallenge challengeDirectory spec = do
|
|||||||
configFile = challengeDirectory </> "config.txt"
|
configFile = challengeDirectory </> "config.txt"
|
||||||
gitignoreFile = challengeDirectory </> ".gitignore"
|
gitignoreFile = challengeDirectory </> ".gitignore"
|
||||||
readmeFile = challengeDirectory </> "README.md"
|
readmeFile = challengeDirectory </> "README.md"
|
||||||
|
mainMetric = head $ gesMetrics spec
|
||||||
|
|
||||||
checkCorrectFile :: FilePath -> IO ()
|
checkCorrectFile :: FilePath -> IO ()
|
||||||
checkCorrectFile filePath = do
|
checkCorrectFile filePath = do
|
||||||
@ -95,6 +102,19 @@ getFileLines file = runResourceT $ runConduit (sourceFile file
|
|||||||
.| CC.map T.unpack
|
.| CC.map T.unpack
|
||||||
.| CL.consume)
|
.| CL.consume)
|
||||||
|
|
||||||
|
-- | Number of lines in the given file, as yielded by 'getFileLines'.
countLines :: FilePath -> IO Int
countLines file = length <$> getFileLines file
|
||||||
|
|
||||||
|
-- | For each line of the given file, the number of tab-separated fields.
--
-- The file is streamed through 'autoDecompress', decoded as UTF-8 and
-- split into lines; the result holds one count per line, in file order.
numberOfColumns :: FilePath -> IO [Int]
numberOfColumns file =
  runResourceT $ runConduit $
       sourceFile file
    .| autoDecompress
    .| CC.decodeUtf8
    .| CT.lines
    .| CC.map (length . T.splitOn "\t")
    .| CL.consume
|
||||||
|
|
||||||
createPerfectOutputFromExpected :: Metric -> FilePath -> FilePath -> IO ()
|
createPerfectOutputFromExpected :: Metric -> FilePath -> FilePath -> IO ()
|
||||||
createPerfectOutputFromExpected metric expectedFile outFile = do
|
createPerfectOutputFromExpected metric expectedFile outFile = do
|
||||||
runResourceT $ runConduit $ (sourceFile expectedFile
|
runResourceT $ runConduit $ (sourceFile expectedFile
|
||||||
@ -116,6 +136,9 @@ findInputFiles = SFF.find never $ fileFilter defaultInputFile
|
|||||||
findOutputFiles :: FilePath -> IO [FilePath]
|
findOutputFiles :: FilePath -> IO [FilePath]
|
||||||
findOutputFiles = SFF.find never $ fileFilter "out*.tsv"
|
findOutputFiles = SFF.find never $ fileFilter "out*.tsv"
|
||||||
|
|
||||||
|
-- | Finds the regular files in the given directory (without recursing into
-- subdirectories) that 'fileFilter' accepts for the name @train.tsv@.
findTrainFiles :: FilePath -> IO [FilePath]
findTrainFiles directory = SFF.find never (fileFilter "train.tsv") directory
|
||||||
|
|
||||||
findExpectedFiles :: FilePath -> IO [FilePath]
|
findExpectedFiles :: FilePath -> IO [FilePath]
|
||||||
findExpectedFiles = SFF.find never $ fileFilter defaultExpectedFile
|
findExpectedFiles = SFF.find never $ fileFilter defaultExpectedFile
|
||||||
|
|
||||||
@ -131,21 +154,28 @@ fileFilter fileName = (SFF.fileType ==? RegularFile) &&? (SFF.fileName ~~? fileN
|
|||||||
exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ]
|
exts = Prelude.concat [ "(", intercalate "|" compressedFilesHandled, ")" ]
|
||||||
|
|
||||||
|
|
||||||
checkTestDirectories :: [FilePath] -> IO ()
|
checkTestDirectories :: Metric -> [FilePath] -> IO ()
|
||||||
checkTestDirectories [] = throwM NoTestDirectories
|
checkTestDirectories _ [] = throwM NoTestDirectories
|
||||||
checkTestDirectories directories = mapM_ checkTestDirectory directories
|
checkTestDirectories metric directories = mapM_ (checkTestDirectory metric) directories
|
||||||
|
|
||||||
checkTestDirectory :: FilePath -> IO ()
|
checkTestDirectory :: Metric -> FilePath -> IO ()
|
||||||
checkTestDirectory directoryPath = do
|
checkTestDirectory metric directoryPath = do
|
||||||
inputFiles <- findInputFiles directoryPath
|
inputFiles <- findInputFiles directoryPath
|
||||||
when (null inputFiles) $ throw $ NoInputFile inputFile
|
when (null inputFiles) $ throw $ NoInputFile inputFile
|
||||||
when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles
|
when (length inputFiles > 1) $ throw $ TooManyInputFiles inputFiles
|
||||||
checkCorrectFile $ head inputFiles
|
checkCorrectFile $ head inputFiles
|
||||||
|
when (fixedNumberOfColumnsInInput metric) $ checkColumns $ head inputFiles
|
||||||
|
|
||||||
expectedFiles <- findExpectedFiles directoryPath
|
expectedFiles <- findExpectedFiles directoryPath
|
||||||
when (null expectedFiles) $ throw $ NoExpectedFile expectedFile
|
when (null expectedFiles) $ throw $ NoExpectedFile expectedFile
|
||||||
when (length expectedFiles > 1) $ throw $ TooManyExpectedFiles expectedFiles
|
when (length expectedFiles > 1) $ throw $ TooManyExpectedFiles expectedFiles
|
||||||
checkCorrectFile $ head expectedFiles
|
checkCorrectFile $ head expectedFiles
|
||||||
|
when (fixedNumberOfColumnsInExpected metric) $ checkColumns $ head expectedFiles
|
||||||
|
|
||||||
|
inputLines <- countLines $ head inputFiles
|
||||||
|
expectedLines <- countLines $ head expectedFiles
|
||||||
|
|
||||||
|
when (inputLines /= expectedLines) $ throw $ VaryingNumberOfLines
|
||||||
|
|
||||||
outputFiles <- findOutputFiles directoryPath
|
outputFiles <- findOutputFiles directoryPath
|
||||||
unless (null outputFiles) $ throw $ OutputFileDetected outputFiles
|
unless (null outputFiles) $ throw $ OutputFileDetected outputFiles
|
||||||
@ -153,6 +183,24 @@ checkTestDirectory directoryPath = do
|
|||||||
inputFile = directoryPath </> defaultInputFile
|
inputFile = directoryPath </> defaultInputFile
|
||||||
expectedFile = directoryPath </> defaultExpectedFile
|
expectedFile = directoryPath </> defaultExpectedFile
|
||||||
|
|
||||||
|
-- | Validates the optional @train@ directory of a challenge.
--
-- Nothing is checked when @\<challengeDirectory\>/train@ does not exist.
-- Otherwise the directory must contain exactly one train file:
--
-- * no train file      -> 'NoInputFile' (with the full expected path),
-- * more than one      -> 'TooManyTrainFiles',
-- * exactly one        -> the file goes through 'checkCorrectFile' and,
--   when the metric requires a fixed number of columns both in the input
--   and in the expected data, through 'checkColumns'.
checkTrainDirectory :: Metric -> FilePath -> IO ()
checkTrainDirectory metric challengeDirectory =
  whenM (doesDirectoryExist trainDirectory) $ do
    trainFiles <- findTrainFiles trainDirectory
    -- A total case analysis replaces the refutable @let [trainFile] = ...@
    -- binding, so every list shape is handled explicitly.
    case trainFiles of
      [] ->
        -- Report the full path, as the test-directory checks do for
        -- their missing input/expected files.
        throwM $ NoInputFile (trainDirectory </> "train.tsv")
      [trainFile] -> do
        checkCorrectFile trainFile
        -- A train file holds both input and expected columns, so the
        -- column check applies only when both parts are fixed-width.
        when (fixedNumberOfColumnsInInput metric && fixedNumberOfColumnsInExpected metric) $
          checkColumns trainFile
      _ ->
        throwM $ TooManyTrainFiles trainFiles
  where
    trainDirectory = challengeDirectory </> "train"
|
||||||
|
|
||||||
|
-- | Checks that all lines of the given file have the same number of
-- tab-separated columns; throws 'VaryingNumberOfColumns' otherwise.
--
-- An empty file, or one whose lines all share a single column count,
-- passes the check.
checkColumns :: FilePath -> IO ()
checkColumns filePath = do
  distinctCounts <- nub <$> numberOfColumns filePath
  case distinctCounts of
    -- Two or more distinct column counts: the file is inconsistent.
    (_ : _ : _) -> throw $ VaryingNumberOfColumns filePath
    _           -> return ()
|
||||||
|
|
||||||
runOnTest :: GEvalSpecification -> FilePath -> IO ()
|
runOnTest :: GEvalSpecification -> FilePath -> IO ()
|
||||||
runOnTest spec testPath = do
|
runOnTest spec testPath = do
|
||||||
[expectedFile] <- findExpectedFiles testPath
|
[expectedFile] <- findExpectedFiles testPath
|
||||||
|
Loading…
Reference in New Issue
Block a user