;;; -*- Mode: Scheme; scheme48-package: rdf-turtle-parser -*-
;;;; Schemantic Web
;;;; RDF Turtle Parser
;;; This code is written by Taylor R. Campbell and placed in the Public
;;; Domain. All warranties are disclaimed.
;;; The version of RDF Turtle parsed by this code is described at
;;; .
;;; The parser uses the noise repetition operator because we pass each
;;; triple off to a user-supplied triple handler, and any result is
;;; accumulated in the user's state, rather than using the parser's
;;; value to accumulate anything.
(define-parser turtle-parser:document
(parser:sequence
(parser:noise:repeated-until (parser:end) turtle-parser:statement)
(parser:map turtle-context/user-state (parser:context))))
(define-parser turtle-parser:statement
(parser:choice (parser:sequence (parser:choice turtle-parser:directive
turtle-parser:triples)
turtle-parser:ws*
(parser:char= #\.)
turtle-parser:ws*)
turtle-parser:ws+))
(define-parser turtle-parser:directive
(*parser
((parser:string= "@prefix"))
(turtle-parser:ws+)
(prefix-name (parser:optional #f turtle-parser:prefix-name))
((parser:char= #\:))
(turtle-parser:ws+)
(prefix-expansion turtle-parser:uri-ref)
(turtle:add-prefix-expansion prefix-name prefix-expansion)))
(define-parser turtle-parser:triples
(*parser
(subject turtle-parser:subject)
(turtle-parser:ws+)
(turtle-parser:predicate/object-list subject)))
;;;; Predicate/Object Lists
(define-parser (turtle-parser:predicate/object-list subject)
(*parser
(predicate turtle-parser:verb)
(turtle-parser:ws+)
((turtle-parser:object-list subject predicate))
(turtle-parser:predicate/object-list-continuation subject)))
(define-parser turtle-parser:verb
(parser:choice (parser:backtrackable
(*parser
((parser:char= #\a))
((parser:peek turtle-parser:ws))
(parser:return rdf:type)))
turtle-parser:predicate))
(define-parser (turtle-parser:object-list subject predicate)
(let ((object-parser (turtle-parser:object->triple subject predicate)))
(*parser
(object-parser)
(turtle-parser:ws*)
(parser:noise:repeated
(*parser
((parser:char= #\,))
(turtle-parser:ws*)
(object-parser)
turtle-parser:ws*)))))
(define-parser (turtle-parser:predicate/object-list-continuation subject)
(parser:optional-noise
(*parser
((parser:char= #\;))
(turtle-parser:ws*)
(parser:optional-noise
(*parser
(predicate turtle-parser:verb)
(turtle-parser:ws+)
((turtle-parser:object-list subject predicate))
(turtle-parser:predicate/object-list-continuation subject))))))
(define-parser (turtle-parser:object->triple subject predicate)
(*parser (object turtle-parser:object)
(turtle:add-triple (make-rdf-triple subject predicate object))))
;;;; Resources
(define-parser turtle-parser:subject
(parser:choice turtle-parser:resource
turtle-parser:blank))
(define-parser turtle-parser:predicate
turtle-parser:resource)
(define-parser turtle-parser:object
(parser:choice turtle-parser:literal
turtle-parser:blank
turtle-parser:resource))
(define-parser turtle-parser:resource
(*parser (uri-ref
(parser:choice turtle-parser:uri-ref turtle-parser:qname))
(if (match-string? uri-matcher:uri-reference uri-ref)
(parser:return (string->rdf-uri-ref uri-ref))
(parser:error
(string-append "Malformed URI reference `" uri-ref "'")))))
(define-parser turtle-parser:uri-ref
(parser:bracketed-string (parser:char= #\<) (parser:char= #\>)
(turtle-parser:string-char turtle-char-set:ucharacter
turtle-string-escapes:uri-ref)))
(define-parser turtle-parser:literal
(parser:choice turtle-parser:string
turtle-parser:number
turtle-parser:boolean))
(define-parser turtle-parser:string
(*parser (lexical-form turtle-parser:quoted-string)
(parser:choice
(*parser (datatype-uri turtle-parser:literal-datatype-uri)
(parser:return (make-rdf-typed-literal lexical-form datatype-uri)))
(*parser (language-tag
(parser:choice turtle-parser:literal-language-tag
(parser:return #f)))
(parser:return (make-rdf-plain-literal lexical-form language-tag))))))
(define-parser turtle-parser:literal-language-tag
(parser:sequence
(parser:char= #\@)
(parser:match->string
(matcher:sequence
(matcher:at-least 1
(matcher:char-in-set turtle-char-set:language-initial))
(matcher:repeated
(matcher:sequence
(matcher:char= #\-)
(matcher:at-least 1
(matcher:char-in-set turtle-char-set:language-trailing))))))))
(define-parser turtle-parser:literal-datatype-uri
(parser:sequence (parser:string= "^^") turtle-parser:resource))
(define-parser turtle-parser:boolean
(*parser (boolean
(parser:backtrackable
(parser:choice (parser:string= "true")
(parser:string= "false"))))
(parser:return (make-rdf-typed-literal boolean xsd:boolean))))
;;;;; Numbers
(define-parser turtle-parser:number
(parser:sequence (parser:choice (parser:char= #\+)
(parser:char= #\-)
(parser:char-in-set char-set:digit))
(parser:error "Number parsing is not yet supported.")))
;;;;; Strings
(define-parser turtle-parser:quoted-string
(parser:sequence
(parser:char= #\")
(parser:choice
(parser:sequence (parser:backtrackable (parser:string= "\"\""))
turtle-parser:long-string-contents)
turtle-parser:short-string-contents)))
(define-parser turtle-parser:long-string-contents
(parser:string:repeated-until
(parser:backtrackable (parser:string= "\"\"\""))
(turtle-parser:string-char turtle-char-set:lcharacter
turtle-string-escapes:long-string)))
(define-parser turtle-parser:short-string-contents
(parser:string:repeated-until (parser:char= #\")
(turtle-parser:string-char turtle-char-set:scharacter
turtle-string-escapes:short-string)))
(define-parser (turtle-parser:string-char char-set escapes)
(parser:choice (*parser (char (parser:char= #\\))
(turtle-parser:string-escape escapes))
(parser:char-in-set char-set)))
(define-parser (turtle-parser:string-escape escapes)
(*parser (escape-char (parser:char))
(cond ((char=? escape-char #\u) (turtle-parser:unicode-escape 4))
((char=? escape-char #\U) (turtle-parser:unicode-escape 8))
((assv escape-char escapes)
=> (lambda (entry)
(parser:return (cadr entry))))
(else
(parser:error
(string-append "Invalid string escape `\\"
(string escape-char)
"'"))))))
(define-parser (turtle-parser:unicode-escape length)
(*parser (hex-string (parser:hex-string length))
(parser:return
(let ((number (string->number hex-string #x10)))
(if (< number ascii-limit)
(ascii->char number)
#\?))))) ;++ fix
(define turtle-string-escapes
`((#\\ #\\)
(#\t ,(ascii->char #x09)) ;Horizontal tab
(#\n ,(ascii->char #x0A)) ;Line feed
(#\r ,(ascii->char #x0D)))) ;Carriage return
(define turtle-string-escapes:long-string
`((#\" #\")
,@turtle-string-escapes))
(define turtle-string-escapes:short-string
turtle-string-escapes:long-string)
(define turtle-string-escapes:uri-ref
`((#\> #\>)
,@turtle-string-escapes))
;;;;; Blank Nodes
(define-parser turtle-parser:blank
(parser:choice turtle-parser:bnode:named
turtle-parser:bnode:empty
turtle-parser:bnode:compound
turtle-parser:bnode:collection))
(define-parser turtle-parser:bnode:named
(*parser (node-id turtle-parser:node-id)
(parser:return (make-rdf-bnode node-id))))
(define-parser turtle-parser:node-id
(parser:sequence (parser:string= "_:") turtle-parser:name))
(define-parser turtle-parser:bnode:empty
(parser:backtrackable
;; This is a little silly, but the Turtle specification does not
;; explictly allow a space between these brackets.
(parser:sequence (parser:string= "[]") turtle:new-anonymous-bnode)))
(define-parser turtle-parser:bnode:compound
(parser:bracketed
(parser:sequence (parser:char= #\[) turtle-parser:ws*)
(parser:sequence turtle-parser:ws* (parser:char= #\]))
(*parser (subject turtle:new-anonymous-bnode)
((turtle-parser:predicate/object-list subject))
(parser:return subject))))
;;; The following ugliness is the obvious recursive parser translated
;;; by hand according to the tail recursion modulo CONS pattern.
(define-parser turtle-parser:bnode:collection
(*parser ((parser:char= #\( ))
(turtle-parser:ws*)
(parser:choice
(parser:sequence (parser:char= #\) ) (parser:return rdf:nil))
(*parser (bnode turtle:new-anonymous-bnode)
(let loop ((pair bnode))
(*parser
((turtle-parser:object->triple pair rdf:first))
(turtle-parser:ws*)
(parser:choice
(*parser
((parser:char= #\) ))
((turtle:add-triple (make-rdf-triple pair rdf:rest rdf:nil)))
(parser:return bnode))
(*parser
(rest turtle:new-anonymous-bnode)
((turtle:add-triple (make-rdf-triple pair rdf:rest rest)))
(loop rest)))))))))
;;;; Names
(define-parser turtle-parser:qname
(*parser
(prefix-name (parser:optional #f turtle-parser:prefix-name))
((parser:char= #\:))
(suffix-text (parser:optional #f turtle-parser:name))
(prefix-expansion (turtle:expand-prefix prefix-name))
(if prefix-expansion
(parser:return
(if suffix-text
(string-append prefix-expansion suffix-text)
prefix-expansion))
(parser:error
(string-append "Unknown prefix `" prefix-name "'")))))
(define-parser turtle-parser:prefix-name
(parser:match->string
(matcher:sequence
(matcher:char-in-set (char-set-delete turtle-char-set:name-initial #\_))
(matcher:repeated (matcher:char-in-set turtle-char-set:name-trailing)))))
(define-parser turtle-parser:name
(parser:match->string
(matcher:sequence
(matcher:char-in-set turtle-char-set:name-initial)
(matcher:repeated (matcher:char-in-set turtle-char-set:name-trailing)))))
;;;; Miscellaneous
(define-parser turtle-parser:ws
(parser:choice (parser:char-in-set turtle-char-set:ws)
turtle-parser:comment))
(define-parser turtle-parser:ws*
(parser:noise:repeated turtle-parser:ws))
(define-parser turtle-parser:ws+
(parser:noise:at-least 1 turtle-parser:ws))
(define-parser turtle-parser:comment
(parser:sequence
(parser:char= #\#)
(parser:noise:repeated
(parser:char-not-in-set turtle-char-set:line-break))))
(define-parser (parser:hex-string length)
(parser:string:exactly length (parser:char-in-set char-set:hex-digit)))
;;;; Turtle Character Sets
;;; We use SRFI 14's UCS-RANGE->CHAR-SET several times here. The upper
;;; bounds are all one off from what you'll find in the Turtle
;;; specification, because, for whatever stupid reason, SRFI 14's
;;; UCS-RANGE->CHAR-SET works with exclusive upper bounds. I am sorry.
(define turtle-char-set:line-break
(char-set (ascii->char #x0A) ;Line feed
(ascii->char #x0D))) ;Carriage return
(define turtle-char-set:ws
(char-set (ascii->char #x09) ;Horizontal tab
(ascii->char #x0A) ;Line feed
(ascii->char #x0D) ;Carriage return
(ascii->char #x20))) ;Horizontal space
(define turtle-char-set:character
(char-set-union (ucs-range->char-set #x20 #x5C)
(ucs-range->char-set #x5D #x110000)))
(define turtle-char-set:ucharacter
(char-set-delete turtle-char-set:character #\>))
(define turtle-char-set:scharacter
(char-set-delete turtle-char-set:character #\"))
(define turtle-char-set:lcharacter
(char-set-adjoin turtle-char-set:character
(ascii->char #x09) ;Horizontal tab
(ascii->char #x0A) ;Carriage return
(ascii->char #x0D))) ;Line feed
(define turtle-char-set:language-initial
char-set:lower-case)
(define turtle-char-set:language-trailing
(char-set-union char-set:lower-case
char-set:digit))
(define turtle-char-set:name-initial
(char-set-union (char-set #\_)
char-set:letter
(ucs-range->char-set #x00C0 #x00D7)
(ucs-range->char-set #x00D8 #x00F7)
(ucs-range->char-set #x00F8 #x0300)
(ucs-range->char-set #x0370 #x037E)
(ucs-range->char-set #x037F #x2000)
(ucs-range->char-set #x200C #x200E)
(ucs-range->char-set #x2070 #x2190)
(ucs-range->char-set #x2C00 #x2FF0)
(ucs-range->char-set #x3001 #xD800)
(ucs-range->char-set #xF900 #xFDD0)
(ucs-range->char-set #xFDF0 #xFFFE)
(ucs-range->char-set #x10000 #xF0000)))
(define turtle-char-set:name-trailing
(char-set-union turtle-char-set:name-initial
(char-set #\- (ascii->char #x00B7))
char-set:digit
(ucs-range->char-set #x0300 #x0370)
(ucs-range->char-set #x203F #x2041)))
(define-record-type
(%make-turtle-context triple-handler user-state prefix-map bnode-number)
turtle-context?
(triple-handler turtle-context/triple-handler)
(user-state turtle-context/user-state)
(prefix-map turtle-context/prefix-map)
(bnode-number turtle-context/bnode-number))
(define (make-turtle-parser-context triple-handler initial-user-state)
(%make-turtle-context triple-handler initial-user-state '() 0))
(define (turtle-context/increment-bnode-number context)
(%make-turtle-context (turtle-context/triple-handler context)
(turtle-context/user-state context)
(turtle-context/prefix-map context)
(+ 1 (turtle-context/bnode-number context))))
(define (turtle-context/add-triple context triple)
(%make-turtle-context (turtle-context/triple-handler context)
((turtle-context/triple-handler context)
triple
(turtle-context/user-state context))
(turtle-context/prefix-map context)
(turtle-context/bnode-number context)))
(define (turtle-context/add-prefix-expansion context name expansion)
(%make-turtle-context (turtle-context/triple-handler context)
(turtle-context/user-state context)
(cons (cons name expansion)
(turtle-context/prefix-map context))
(turtle-context/bnode-number context)))
(define (turtle-context/expand-prefix context name)
(any (if name
(lambda (entry)
(and (string? (car entry))
(string=? (car entry) name)
(cdr entry)))
(lambda (entry)
(and (not (car entry))
(cdr entry))))
(turtle-context/prefix-map context)))
;;;; Turtle Context-Related Parsers
(define-parser turtle:new-anonymous-bnode
(parser:extend (parser:context)
(lambda (context)
(let ((number (turtle-context/bnode-number context)))
(parser:sequence
(parser:set-context (turtle-context/increment-bnode-number context))
(parser:return
(make-rdf-bnode
(string-append "gen" (number->string number #d10)))))))))
(define-parser (turtle:add-triple triple)
(parser:modify-context
(lambda (context)
(turtle-context/add-triple context triple))))
(define-parser (turtle:add-prefix-expansion name expansion)
(parser:modify-context
(lambda (context)
(turtle-context/add-prefix-expansion context name expansion))))
(define-parser (turtle:expand-prefix name)
(parser:call-with-context
(lambda (context)
(turtle-context/expand-prefix context name))))
;;;; Standard URI References
(define rdf-prefix
"http://www.w3.org/1999/02/22-rdf-syntax-ns#")
(define (rdf: suffix)
(string->rdf-uri-ref (string-append rdf-prefix suffix)))
(define rdf:type (rdf: "type"))
(define rdf:first (rdf: "first"))
(define rdf:rest (rdf: "rest"))
(define rdf:nil (rdf: "nil"))
(define xsd-prefix
"http://www.w3.org/2001/XMLSchema#")
(define (xsd: suffix)
(string->rdf-uri-ref (string-append xsd-prefix suffix)))
(define xsd:boolean (xsd: "boolean"))
(define xsd:decimal (xsd: "decimal"))
(define xsd:double (xsd: "double"))
(define xsd:integer (xsd: "integer"))