;; parser.scm: custom SXML parser implementation for SDOM
;; Copyright (C) 2010 Julian Graham

;; SDOM free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; SDOM is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with SDOM.  If not, see <http://www.gnu.org/licenses/>.

#!r6rs

(library (sdom parser (0 5))
  (export sdom:xml-parser)
  (import (rnrs) 
	  (srfi :1) 
	  (srfi :13) 
	  (sxml ssax)
	  (sdom common))

  (define input-parse:init-buffer
    (let ((buffer (make-string 512)))
      (lambda () buffer))) 

  (define (next-token-of incl-list/pred port)
    (let* ((buffer (input-parse:init-buffer))
	   (curr-buf-len (string-length buffer)))
      (if (procedure? incl-list/pred)
	  (let outer ((buffer buffer) (filled-buffer-l '()))
	    (let loop ((i 0))
	      (if (>= i curr-buf-len)		; make sure we have space
		  (outer (make-string curr-buf-len) 
			 (cons buffer filled-buffer-l))
		  (let ((c (incl-list/pred (peek-char port))))
		    (if c
			(begin
			  (string-set! buffer i c)
			  (read-char port)
			  (loop (+ i 1)))
			(if (null? filled-buffer-l) (substring buffer 0 i)
			    (string-concatenate-reverse 
			     filled-buffer-l buffer i)))))))
	  
	  (let outer ((buffer buffer) (filled-buffer-l '()))
	    (let loop ((i 0))
	      (if (>= i curr-buf-len)
		  (outer (make-string curr-buf-len) 
			 (cons buffer filled-buffer-l))
		  (let ((c (peek-char port)))
		    (cond
		     ((not (memv c incl-list/pred))
		      (if (null? filled-buffer-l) (substring buffer 0 i)
			  (string-concatenate-reverse 
			   filled-buffer-l buffer i)))
		     (else
		      (string-set! buffer i c)
		      (read-char port)
		      (loop (+ i 1))))))))
	  )))

  (define (ssax:ncname-starting-char? a-char)
    (and (char? a-char)
	 (or (char-alphabetic? a-char)
	     (char=? #\_ a-char))))

  (define (ssax:read-NCName port)
    (let ((first-char (peek-char port)))
      (or (ssax:ncname-starting-char? first-char)
	  (parser-error port "XMLNS [4] for '" first-char "'")))
    (string->symbol
     (next-token-of
      (lambda (c)
        (cond
	 ((eof-object? c) #f)
	 ((char-alphabetic? c) c)
	 ((string-index "0123456789.-_" c) c)
	 (else #f)))
      port)))

  (define (skip-while skip-chars port)
    (do ((c (peek-char port) (peek-char port)))
	((not (memv c skip-chars)) c)
      (read-char port)))

  (define ssax:S-chars (map integer->char '(32 10 9 13)))

  (define (ssax:skip-S port) (skip-while ssax:S-chars port))

  ;; This is to override the definition in SXML, which discards, for some 
  ;; reason, the publid ID.
  
  (define (ssax:read-external-id p)
    (let ((discriminator (ssax:read-NCName p)))
      (assert-curr-char ssax:S-chars "space after SYSTEM or PUBLIC" p)
      (ssax:skip-S p)
      (let ((delim (assert-curr-char '(#\' #\" ) "XML [11], XML [12]" p)))
	(cond
	 ((eq? discriminator (string->symbol "SYSTEM"))
	  (begin0 (next-token '() (list delim) "XML [11]" p)
		  (read-char p))) ; reading the closing delim
	 ((eq? discriminator (string->symbol "PUBLIC"))
	  (let ((publicid (next-token '() (list delim) "XML [11]" p)))
	    (read-char p)
	    (assert-curr-char ssax:S-chars "space after PubidLiteral" p)
	    (ssax:skip-S p)
	    (let* ((delim (assert-curr-char '(#\' #\" ) "XML [11]" p))
		   (systemid (next-token '() (list delim) "XML [11]" p)))
	      (read-char p) ; reading the closing delim
	      (cons publicid systemid))))
	 (else (parser-error (current-output-port) "XML [75], " discriminator 
			     " rather than SYSTEM or PUBLIC"))))))

  (define entity-parse-prefix "<?xml version=\"1.0\"?><sdom-entity>")
  (define entity-parse-suffix "</sdom-entity>")
  
  (define (sdom:xml-parser port namespace-prefix-assig)
    (letrec ((ns (map (lambda (el)
			(cons* #f (car el) (ssax:uri-string->symbol (cdr el))))
		      namespace-prefix-assig))
	     (RES-NAME->SXML (lambda (res-name)
			       (string->symbol
				(string-append
				 (if (eq? (car res-name) 'xml)
				     xml-ns-uri
				     (symbol->string (car res-name)))
				 ":"
				 (symbol->string (cdr res-name))))))
	     
; This is a teensy bit of trickery -- to properly create the
; entity definition, we have to run its contents through the
; parser and then append the expanded structure to a new entity
; node, which is typically read-only. So we temporarily make SDOM 
; think it's actually an element node.

	     (post-parse-entity 
	      (lambda (doc entdef)
		(let* ((newdoc (sdom:xml->sdom 
				(open-string-input-port
				 (string-append entity-parse-prefix
						(cdr entdef)
						entity-parse-suffix))))
		       (ent (list '*ENTITY* (car entdef)))
		       (elts (sdom:child-nodes (sdom:document-element newdoc)))
		       (len (length elts)))
		  (for-each (lambda (x)
			      (adopt-node! doc x) 
			      (insert-child! ent x len))
			    elts)
		  ent)))
	     
	     (parse-dtd 
	      (lambda (p ents)
		(letrec ((rt (lambda (p c lst)
			       (let ((c1 (read-char p)))
				 (cond ((eqv? c c1) lst)
				       ((eof-object? c1) 
					(error "unexpected eof"))
				       (else (rt 
					      p c (append lst (list c1))))))))
			 (skip-until (lambda (lst p)
				       (let ((c (peek-char p)))
					 (if (or (memv c lst) (eof-object? c)) 
					     c 
					     (begin (read-char p) 
						    (skip-until lst p))))))) 
		  (if (eof-object? (skip-until (list #\< '*eof*) p))
		      (list ents)
		      (let ((tok (ssax:read-markup-token p)))
			(cond ((eq? (cdr tok) 'ENTITY)
			       (ssax:skip-S p)
			       (parse-dtd 
				p (cons (cons (ssax:read-NCName p)
					      (list->string 
					       (rt p 
						   (begin
						     (skip-until (list #\" #\') 
								 p)
						     (read-char p))
						   (list))))
					ents)))
			      (else (parse-dtd p ents)))))))))
      (let ((result
	     (reverse
	      ((ssax:make-parser
		NEW-LEVEL-SEED 
		(lambda (elem-gi attributes ns expected-content seed) '())
		
		FINISH-ELEMENT
		(lambda (elem-gi attributes ns parent-seed seed)
		  (define (find-prefix uri)
		    (let ((p (find (lambda (ns-decl) (eq? uri (cadr ns-decl)))
				   ns)))
		      (and p (car p))))
				   
		  (let* ((orig-prefix 
			  (and (pair? elem-gi)
			       (find-prefix (car elem-gi))))
			 (elem-gi
			  (if (symbol? elem-gi)
			      elem-gi
			      (cond ((not orig-prefix) elem-gi)
				    ((eq? orig-prefix '*DEFAULT*) elem-gi)
				    (else (cdr elem-gi))))))
		  
;		  (display "elem-gi: ") (display elem-gi) (newline)
;		  (display "attributes: ") (display attributes) (newline)
;		  (display "ns: ") (display ns) (newline)
;		  (display "parent seed: ") (display parent-seed) (newline)
;		  (display "seed: ") (display seed) (newline)
;		  (newline)

		  (let* ((seed (ssax:reverse-collect-str-drop-ws seed))
			 (attrs 
			  (attlist-fold
			   (lambda (attr accum)
			     (cons (append (list
					    (if (symbol? (car attr))
						(car attr)
						(RES-NAME->SXML 
						 (car attr)))
					    (cdr attr))
					   (let ((orig-attr-prefix
						  (and (pair? (car attr))
						       (find-prefix (caar attr)))))
					     (if (and orig-attr-prefix (not (eq? orig-attr-prefix '*DEFAULT*)))
						 `((^ (*SDOM:ORIG-PREFIX*
						       ,(symbol->string orig-attr-prefix))))
						 '())))
				   accum))
			   '()
			   attributes))
			 (attrs (if (and orig-prefix 
					 (not (eq? orig-prefix '*DEFAULT*)))
				    (cons `(*SDOM:ORIG-PREFIX* 
					    ,(symbol->string orig-prefix))
					  attrs) 
				    attrs)))
		    (cons (cons (if (symbol? elem-gi) elem-gi
				    (RES-NAME->SXML elem-gi))
				(if (null? attrs) seed
				    (cons (cons '^ attrs) seed)))
			  parent-seed))))
		
		CHAR-DATA-HANDLER
		(lambda (string1 string2 seed)
		  (if (string-null? string2) 
		      (cons string1 seed)
		      (cons* string2 string1 seed)))
		
		DOCTYPE
		(lambda (port docname systemid internal-subset? seed)
		  (let* ((s (if internal-subset?
				(letrec 
				    ((f (lambda (y)
					  (let ((z (read-char port)))
					    (if (and (eqv? (car (last-pair y)) 
							   #\]) 
						     (eqv? z #\>))
						y
						(f (append y (list z))))))))
				  (list->string (f (list (read-char port)))))
				'()))
			 (sdata (and (not (null? s))
				     (parse-dtd (open-string-input-port s) 
						(list))))
			 (dtd (list '*DOCTYPE*)))

		    (values
		     #f (if (null? sdata) '() (car sdata)) ns 
		     (cons (list '*DOCTYPE*
				 (symbol->string (if (pair? docname) 
						     (car docname) 
						     docname))
				 (and (not (null? sdata))
				      (map (lambda (ent) 
					     `(*ENTITY* ,(car ent) ,(cdr ent)))
					   (car sdata)))
				 #f
				 (and (list? systemid) (car systemid))
				 (and (list? systemid) (cdr systemid))
				 sdata)
			   seed))))
		
		UNDECL-ROOT
		(lambda (elem-gi seed) (values #f '() ns seed))
		
		PI
		((*DEFAULT* .
			    (lambda (port pi-tag seed)
			      (cons (list '*PI* 
					  pi-tag 
					  (ssax:read-pi-body-as-string port))
				    seed)))))
	       
	       port '()))))
	(cons '*TOP*
	      (if (null? namespace-prefix-assig) result
		  (cons (list '^ (cons '*NAMESPACES* 
				       (map (lambda (ns) 
					      (list (car ns) (cdr ns)))
					    namespace-prefix-assig)))
			result))))))
)