Skip to content

Commit

Permalink
Use xmlbf for XML parsing (#67)
Browse files Browse the repository at this point in the history
A new implementation of the RDF/XML parser, using the xmlbf library. The previous RDF/XML parser was based on HXT, but failed 65 of the W3C parsing unit tests. This new xmlbf based parser fixes all these failing tests (all but one, being worked on), and is ~25% faster too.
  • Loading branch information
wismill authored and robstewart57 committed Jun 3, 2019
1 parent 0691d7e commit eca4dae
Show file tree
Hide file tree
Showing 40 changed files with 1,471 additions and 937 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ TAGS
*.backup
/.cabal-sandbox
cabal.sandbox.config
cabal.project.local
countries.ttl
*.prof
bench/MainCriterion
Expand Down
65 changes: 65 additions & 0 deletions .hlint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# HLint configuration file
# https://github.com/ndmitchell/hlint
##########################

# This file contains a template configuration file, which is typically
# placed as .hlint.yaml in the root of your project


# Specify additional command line arguments
#
# - arguments: [--color, --cpp-simple, -XQuasiQuotes]


# Control which extensions/flags/modules/functions can be used
#
# - extensions:
# - default: false # all extension are banned by default
# - name: [PatternGuards, ViewPatterns] # only these listed extensions can be used
# - {name: CPP, within: CrossPlatform} # CPP can only be used in a given module
#
# - flags:
# - {name: -w, within: []} # -w is allowed nowhere
#
# - modules:
# - {name: [Data.Set, Data.HashSet], as: Set} # if you import Data.Set qualified, it must be as 'Set'
# - {name: Control.Arrow, within: []} # Certain modules are banned entirely
#
# - functions:
# - {name: unsafePerformIO, within: []} # unsafePerformIO can only appear in no modules


# Add custom hints for this project
#
# Will suggest replacing "wibbleMany [myvar]" with "wibbleOne myvar"
# - error: {lhs: "wibbleMany [x]", rhs: wibbleOne x}


# Turn on hints that are off by default
#
# Ban "module X(module X) where", to require a real export list
# - warn: {name: Use explicit module export list}
#
# Replace a $ b $ c with a . b $ c
# - group: {name: dollar, enabled: true}
#
# Generalise map to fmap, ++ to <>
- group: {name: generalise, enabled: true}


# Ignore some builtin hints
# - ignore: {name: Use let}
# - ignore: {name: Use const, within: SpecialModule} # Only within certain modules


# Define some custom infix operators
# - fixity: infixr 3 ~^#^~


# To generate a suitable file for HLint do:
# $ hlint --default > .hlint.yaml

- ignore: {name: Eta reduce}
- ignore: {name: Redundant bracket}
- ignore: {name: Reduce duplication}
- ignore: {name: Use camelCase}
8 changes: 4 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,6 @@ matrix:
compiler: ": #stack 7.10.3"
addons: {apt: {packages: [libgmp-dev]}}

- env: BUILD=stack ARGS="--resolver lts-7"
compiler: ": #stack 8.0.1"
addons: {apt: {packages: [libgmp-dev]}}

- env: BUILD=stack ARGS="--resolver lts-9"
compiler: ": #stack 8.0.2"
addons: {apt: {packages: [libgmp-dev]}}
Expand All @@ -97,6 +93,10 @@ matrix:
compiler: ": #stack 8.4.3"
addons: {apt: {packages: [libgmp-dev]}}

- env: BUILD=stack ARGS="--resolver lts-13"
compiler: ": #stack 8.6.5"
addons: {apt: {packages: [libgmp-dev]}}

# Nightly builds are allowed to fail
# - env: BUILD=stack ARGS="--resolver nightly"
# compiler: ": #stack nightly"
Expand Down
95 changes: 54 additions & 41 deletions bench/MainCriterion.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Main where

import Prelude hiding (readFile)
import Data.Semigroup (Semigroup(..))
import Criterion
import Criterion.Types
import Criterion.Main
Expand All @@ -18,21 +19,15 @@ import Control.DeepSeq (NFData)
-- $ gzip -d bills.099.actions.rdf.gz

parseXmlRDF :: Rdf a => T.Text -> RDF a
parseXmlRDF s =
let (Right rdf) = parseString (XmlParser Nothing Nothing) s
in rdf
parseXmlRDF = either (error . show) id . parseString (XmlParser Nothing Nothing)
{-# INLINE parseXmlRDF #-}

parseNtRDF :: Rdf a => T.Text -> RDF a
parseNtRDF s =
let (Right rdf) = parseString NTriplesParser s
in rdf
parseNtRDF = either (error . show) id . parseString NTriplesParser
{-# INLINE parseNtRDF #-}

parseTtlRDF :: Rdf a => T.Text -> RDF a
parseTtlRDF s =
let (Right rdf) = parseString (TurtleParser Nothing Nothing) s
in rdf
parseTtlRDF = either (error . show) id . parseString (TurtleParser Nothing Nothing)
{-# INLINE parseTtlRDF #-}

queryGr :: Rdf a => (Maybe Node,Maybe Node,Maybe Node,RDF a) -> [Triple]
Expand All @@ -48,15 +43,19 @@ main :: IO ()
main = defaultMainWith
(defaultConfig {resamples = 100})
[ env
-- [FIXME] Do not rely on system's defaults to read files.
(do fawltyContentTurtle <- readFile "data/ttl/fawlty1.ttl"
fawltyContentNTriples <- readFile "data/nt/all-fawlty-towers.nt"
rdf1' <- parseFile (XmlParser Nothing Nothing) xmlFile
rdf2' <- parseFile (XmlParser Nothing Nothing) xmlFile
let rdf1 = either (error . show) id rdf1' :: RDF TList
xmlContent <- readFile xmlFile
let rdf1' = parseString (XmlParser Nothing Nothing) xmlContent
rdf2' = parseString (XmlParser Nothing Nothing) xmlContent
rdf3' =parseString (XmlParser Nothing Nothing) xmlContent
rdf1 = either (error . show) id rdf1' :: RDF TList
rdf2 = either (error . show) id rdf2' :: RDF AdjHashMap
rdf3 = either (error . show) id rdf3' :: RDF AlgebraicGraph
triples = triplesOf rdf1
return (rdf1, rdf2, triples, fawltyContentNTriples, fawltyContentTurtle)) $
\ ~(triplesList, adjMap, triples, fawltyContentNTriples, fawltyContentTurtle) ->
return (rdf1, rdf2, rdf3, triples, fawltyContentNTriples, fawltyContentTurtle, xmlContent)) $
\ ~(triplesList, adjMap, algGraph, triples, fawltyContentNTriples, fawltyContentTurtle, xmlContent) ->
bgroup
"rdf4h"
[ bgroup
Expand All @@ -81,42 +80,56 @@ main = defaultMainWith
let res = parseString (TurtleParserCustom Nothing Nothing Attoparsec) t :: Either ParseFailure (RDF TList)
in either (error . show) id res
) fawltyContentTurtle
, bench "xml-xmlbf" $
nf (\t ->
let res = parseString (XmlParser Nothing Nothing) t :: Either ParseFailure (RDF TList)
in either (error . show) id res
) xmlContent
, bench "xml-xht" $
nf (\t ->
let res = parseString (XmlParserHXT Nothing Nothing) t :: Either ParseFailure (RDF TList)
in either (error . show) id res
) xmlContent
]
,
bgroup
"query"
(queryBench "TList" triplesList ++
queryBench "AdjHashMap" adjMap
-- queryBench "SP" mapSP ++ queryBench "HashSP" hashMapSP
(queryBench "TList" triplesList <>
queryBench "AdjHashMap" adjMap <>
queryBench "AlgebraicGraph" algGraph
-- queryBench "SP" mapSP <> queryBench "HashSP" hashMapSP
)
, bgroup
"select"
(selectBench "TList" triplesList ++
selectBench "AdjHashMap" adjMap
-- selectBench "SP" mapSP ++ selectBench "HashSP" hashMapSP
(selectBench "TList" triplesList <>
selectBench "AdjHashMap" adjMap <>
selectBench "AlgebraicGraph" algGraph
-- selectBench "SP" mapSP <> selectBench "HashSP" hashMapSP
)
, bgroup
"add-remove-triples"
(addRemoveTriples "TList" triples (empty :: RDF TList) triplesList
++ addRemoveTriples "AdjHashMap" triples (empty :: RDF AdjHashMap) adjMap
(addRemoveTriples "TList" triples (empty :: RDF TList) triplesList <>
addRemoveTriples "AdjHashMap" triples (empty :: RDF AdjHashMap) adjMap <>
addRemoveTriples "AlgebraicGraph" triples (empty :: RDF AlgebraicGraph) algGraph
)
, bgroup
"count_triples"
[ bench "TList" (nf (length . triplesOf) triplesList)
, bench "AdjHashMap" (nf (length . triplesOf) adjMap)
, bench "AlgebraicGraph" (nf (length . triplesOf) algGraph)
]
]
]

selectBench :: Rdf a => String -> RDF a -> [Benchmark]
selectBench label gr =
[ bench (label ++ " SPO") $ nf selectGr (subjSelect,predSelect,objSelect,gr)
, bench (label ++ " SP") $ nf selectGr (subjSelect,predSelect,selectNothing,gr)
, bench (label ++ " S") $ nf selectGr (subjSelect,selectNothing,selectNothing,gr)
, bench (label ++ " PO") $ nf selectGr (selectNothing,predSelect,objSelect,gr)
, bench (label ++ " SO") $ nf selectGr (subjSelect,selectNothing,objSelect,gr)
, bench (label ++ " P") $ nf selectGr (selectNothing,predSelect,selectNothing,gr)
, bench (label ++ " O") $ nf selectGr (selectNothing,selectNothing,objSelect,gr)
[ bench (label <> " SPO") $ nf selectGr (subjSelect,predSelect,objSelect,gr)
, bench (label <> " SP") $ nf selectGr (subjSelect,predSelect,selectNothing,gr)
, bench (label <> " S") $ nf selectGr (subjSelect,selectNothing,selectNothing,gr)
, bench (label <> " PO") $ nf selectGr (selectNothing,predSelect,objSelect,gr)
, bench (label <> " SO") $ nf selectGr (subjSelect,selectNothing,objSelect,gr)
, bench (label <> " P") $ nf selectGr (selectNothing,predSelect,selectNothing,gr)
, bench (label <> " O") $ nf selectGr (selectNothing,selectNothing,objSelect,gr)
]

subjSelect, predSelect, objSelect, selectNothing :: Maybe (Node -> Bool)
Expand All @@ -133,25 +146,25 @@ queryNothing = Nothing

queryBench :: Rdf a => String -> RDF a -> [Benchmark]
queryBench label gr =
[ bench (label ++ " SPO") $ nf queryGr (subjQuery,predQuery,objQuery,gr)
, bench (label ++ " SP") $ nf queryGr (subjQuery,predQuery,queryNothing,gr)
, bench (label ++ " S") $ nf queryGr (subjQuery,queryNothing,queryNothing,gr)
, bench (label ++ " PO") $ nf queryGr (queryNothing,predQuery,objQuery,gr)
, bench (label ++ " SO") $ nf queryGr (subjQuery,queryNothing,objQuery,gr)
, bench (label ++ " P") $ nf queryGr (queryNothing,predQuery,queryNothing,gr)
, bench (label ++ " O") $ nf queryGr (queryNothing,queryNothing,objQuery,gr)
[ bench (label <> " SPO") $ nf queryGr (subjQuery,predQuery,objQuery,gr)
, bench (label <> " SP") $ nf queryGr (subjQuery,predQuery,queryNothing,gr)
, bench (label <> " S") $ nf queryGr (subjQuery,queryNothing,queryNothing,gr)
, bench (label <> " PO") $ nf queryGr (queryNothing,predQuery,objQuery,gr)
, bench (label <> " SO") $ nf queryGr (subjQuery,queryNothing,objQuery,gr)
, bench (label <> " P") $ nf queryGr (queryNothing,predQuery,queryNothing,gr)
, bench (label <> " O") $ nf queryGr (queryNothing,queryNothing,objQuery,gr)
]

addRemoveTriples :: (NFData a,NFData (RDF a), Rdf a) => String -> Triples -> RDF a -> RDF a -> [Benchmark]
addRemoveTriples :: (NFData (RDF a), Rdf a) => String -> Triples -> RDF a -> RDF a -> [Benchmark]
addRemoveTriples lbl triples emptyGr populatedGr =
[ bench (lbl ++ "-add-triples") $ nf addTriples (triples,emptyGr)
, bench (lbl ++ "-remove-triples") $ nf removeTriples (triples,populatedGr)
[ bench (lbl <> "-add-triples") $ nf addTriples (triples,emptyGr)
, bench (lbl <> "-remove-triples") $ nf removeTriples (triples,populatedGr)
]

addTriples :: Rdf a => (Triples,RDF a) -> RDF a
addTriples (triples,emptyGr) =
foldr (\t g -> addTriple g t) emptyGr triples
foldr (flip addTriple) emptyGr triples

removeTriples :: Rdf a => (Triples,RDF a) -> RDF a
removeTriples (triples,populatedGr) =
foldr (\t g -> removeTriple g t) populatedGr triples
foldr (flip removeTriple) populatedGr triples
2 changes: 1 addition & 1 deletion data/ttl/conformance/test-00.out
Original file line number Diff line number Diff line change
@@ -1 +1 @@
_:genid1 <http://www.w3.org/2001/sw/DataAccess/df1/tests/#x> <http://www.w3.org/2001/sw/DataAccess/df1/tests/#y> .
_:genid1 <http://www.w3.org/2001/sw/DataAccess/df1/tests/test-00.ttl#x> <http://www.w3.org/2001/sw/DataAccess/df1/tests/test-00.ttl#y> .
6 changes: 3 additions & 3 deletions examples/ESWC.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ heldByProp = "swc:heldBy"
eswcCommitteeMembers :: RDF TList -> [T.Text]
eswcCommitteeMembers graph =
let triples = query graph (Just (unode eswcCommitteeURI)) (Just (unode heldByProp)) Nothing
memberURIs = map objectOf triples
in map
memberURIs = fmap objectOf triples
in fmap
(\memberURI ->
let (LNode (PlainL (firstName::T.Text))) =
objectOf $ head $ query graph (Just memberURI) (Just (unode "foaf:firstName")) Nothing
(LNode (PlainL lastName)) =
objectOf $ head $ query graph (Just memberURI) (Just (unode "foaf:lastName")) Nothing
in (T.append firstName (T.append (T.pack " ") lastName)))
memberURIs

main :: IO ()
main = do
result <- parseURL
Expand Down
2 changes: 1 addition & 1 deletion examples/ParseURLs.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ timBernersLee :: IO ()
timBernersLee = do
Right (rdf::RDF TList) <- parseURL (XmlParser Nothing Nothing) "http://www.w3.org/People/Berners-Lee/card.rdf"
let ts = query rdf (Just (UNode "http://www.w3.org/2011/Talks/0331-hyderabad-tbl/data#talk")) (Just (UNode "dct:title")) Nothing
let talks = map (\(Triple _ _ (LNode (PlainL s))) -> s) ts
let talks = fmap (\(Triple _ _ (LNode (PlainL s))) -> s) ts
print talks

main :: IO ()
Expand Down
Loading

0 comments on commit eca4dae

Please sign in to comment.