;; lexer.scm: Lexical analyzer for SCSS
;; Copyright (C) 2011 Julian Graham

;; SCSS is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

#!r6rs

(library (scss lexer)
  (export make-css21-lexer

	  lexical-token-category
	  lexical-token-value)

  (import (rnrs base)
	  (rnrs io simple)
	  (rnrs lists)
	  (rnrs records syntactic)
	  (rnrs unicode)
	  (only (srfi :1) append! last-pair))

  (define-record-type (lexical-token make-lexical-token lexical-token?)
    (fields (immutable category lexical-token-category)
	    (immutable source lexical-token-source)
	    (immutable value lexical-token-value)))
  
  (define-record-type (source-location make-source-location source-location?)
    (fields (immutable input source-location-input)
	    (immutable line source-location-line)
	    (immutable column source-location-column)
	    (immutable offset source-location-offset)
	    (immutable length source-location-length)))
  
  (define (make-css21-lexer strm)
    (define line 0)
    (define num-lines 0)
    (define column 0)
    (define offset 0)  
    (define lines (list))
    
    (define buffer (list))
    
    (define (lexer:read-char)
      (if (null? buffer)
	  (let ((c (read-char strm)))
	    (or (eof-object? c)
		(begin
		  (if (char=? c #\newline)
		      (begin
			(set! line (+ line 1))
			(if (> line num-lines)
			    (begin (if (null? lines)
				       (set! lines (list column))
				       (append! lines (list column)))
				   (set! num-lines (+ num-lines 1))))
			(set! column 0))
		      (set! column (+ column 1)))
		  (set! offset (+ offset 1))))
	    c)
	  (let ((c (car buffer)))
	    (set! buffer (cdr buffer))
	    c)))
    
    (define (lexer:peek-char)
      (if (null? buffer) (peek-char strm) (car buffer)))
    
    (define (lexer:unread-char c)
      (set! offset (- offset 1))
      (if (char? c)
	  (begin 
	    (if (char=? c #\newline)
		(begin (set! line (- line 1))
		       (set! column (list-ref lines line))))
	    (set! buffer (cons c buffer)))))
    
    (define (next-internal? str case-sensitive?)
      (define (unread c) (lexer:unread-char c))
      (define pred? (if case-sensitive? char=? char-ci=?))
      (define (rewind match) (for-each unread match) #f)
      (let loop ((chrs (string->list str))
		 (match (list)))
	(or (null? chrs)
	    (let ((c (lexer:read-char)))
	      (cond ((eof-object? c) (rewind match))
		    ((pred? c (car chrs)) (loop (cdr chrs) (cons c match)))
		    (else (lexer:unread-char c) (rewind match)))))))
    
    (define (next? str) (next-internal? str #t))
    (define (next-ci? str) (next-internal? str #f))
    
    (define (read-number seen-dot?)
      (let ((c (lexer:read-char)))
	(cond ((eof-object? c) '())
	      ((char-numeric? c) (cons c (read-number seen-dot?)))
	      ((char=? c #\.) (if seen-dot? 
				  (begin (lexer:unread-char c) '())
				  (cons c (read-number #t))))
	      (else (lexer:unread-char c) '()))))
    
    (define (consume-whitespace)
      (let ((c (lexer:peek-char)))
	(or (eof-object? c)
	    (and (char-whitespace? c)
		 (lexer:read-char)
		 (consume-whitespace)))))
    
    (define (consume-comments)
      (let ((c (lexer:read-char)))
	(cond ((eof-object? c))
	      ((and (char=? c #\*) (char=? (lexer:peek-char) #\/)) 
	       (lexer:read-char))
	      (else (consume-comments)))))
    
    (define (consume-whitespace-and-comments)
      (let loop ((c (lexer:read-char)))
	(cond ((eof-object? c) #f)
	      ((char-whitespace? c) 
	       (consume-whitespace) 
	       (loop (lexer:read-char)))
	      ((and (char=? c #\/) (char=? (lexer:peek-char) #\*))
	       (lexer:read-char)
	       (consume-comments) 
	       (loop (lexer:read-char)))
	      (else (lexer:unread-char c)))))
    
    (define (read-string type)
      (let ((c (lexer:read-char)))
	(cond ((eof-object? c) '())
	      ((char=? c #\\)
	       (let ((pc (lexer:peek-char)))
		 (cond ((eqv? pc type)
			(lexer:read-char) 
			(cons c (cons type (read-string type))))
		       ((eqv? pc #\newline) 
			(lexer:read-char) 
			(read-string type))
		       (else (cons c (read-string type))))))
	      ((char=? c type) (list type))
	      (else (cons c (read-string type))))))
    
    (define (read-hex)
      (define (read-hex-inner count)
	(let ((c (lexer:peek-char)))
	  (cond ((or (= count 6) (eof-object? c)) '())
		((and (> count 0) (char-whitespace? c)) 
		 (lexer:read-char) (list c))
		((hex? c) 
		 (lexer:read-char) 
		 (cons c (read-hex-inner (+ count 1))))
		(else '()))))
      (let ((h (read-hex-inner 0))) 
	(if (null? h) #f (begin (consume-whitespace) h))))
    
    (define (hex? chr)
      (or (char-numeric? chr) (and (char-ci>=? chr #\A) (char-ci<=? chr #\F))))
    (define (read-escape)
      (let ((c (lexer:peek-char)))
	(if (hex? c) (read-hex) (begin (lexer:read-char) (list c)))))
    
    (define (read-url)
      (define (read-url-inner)
	(let ((c (lexer:read-char)))
	  (cond ((eqv? c #\)) (lexer:unread-char c) '())
		((or (memv c '(#\! #\# #\$ #\% #\&))
		     (and (char-ci>=? c #\*) (char-ci<=? #\~)))
		 (cons c (read-url-inner)))
		((eqv? c #\\) (append (cons c (read-escape))
				      (read-url-inner)))
		(else (lexer:unread-char c) '()))))
      (read-url-inner))
    
    (define (ident-start-char? chr)
      (or (char-alphabetic? chr) 
	  (char=? chr #\_)
	  (> (char->integer chr) 127)))
    (define (ident-rest-char? chr)
      (or (ident-start-char? chr)
	  (char=? chr #\-)
	  (char-numeric? chr)))
    
    (define (read-ident c)
      (define (read-start c)
	(cond ((eof-object? c) '())
	      ((ident-start-char? c) (list c))
	      ((char=? c #\\) (append (cons c (read-escape))))
	      (else '())))
      (define (read-rest)
	(let ((c (lexer:read-char)))
	  (cond ((eof-object? c) '())
		((ident-rest-char? c) (cons c (read-rest)))
		((char=? c #\\) (append (cons c (read-escape)) 
					(read-rest)))
		(else (lexer:unread-char c) '()))))
      (if (and (not (eof-object? c)) (char=? c #\-))
	  (let* ((cc (lexer:read-char))
		 (s (read-start cc)))
	    (if (null? s)
		(begin (lexer:unread-char cc) #f)
		(cons c (append s (read-rest)))))
	  (let ((s (read-start c)))
	    (if (null? s) #f (append s (read-rest))))))
    
    (define (make-token t)
      (let ((ct (cdr t)))
	(make-lexical-token 
	 (car t) 
	 (make-source-location "foo" line column offset (string-length ct))
	 ct)))
    
    (lambda ()
      (let ((c (lexer:read-char)))
	(cond ((eof-object? c) '*eoi*)
	      ((char-whitespace? c) 
	       (consume-whitespace)
	       (consume-whitespace-and-comments)
	       (make-token '(S . " ")))
	      ((and (eqv? c #\<) (next? "!--")) (make-token '(CD . "!--")))
	      ((and (eqv? c #\-) (next? "->")) (make-token '(CD . "->")))
	      ((and (eqv? c #\~) (next? "=")) (make-token '(INCLUDES . "~=")))
	      ((and (eqv? c #\|) (next? "=")) (make-token '(DASHMATCH . "|=")))
	      ((eqv? c #\*) (make-token '(STAR . "*")))
	      
	      ((or (and (eqv? c #\.) (char-numeric? (lexer:peek-char)))
		   (and (char-numeric? c)))
	       (let* ((n (read-number (eqv? c #\.)))
		      (o (list->string (cons c n))))
		 (cond 
		  ((next-ci? "em") 
		   (make-token (cons 'EMS (string-append o "em"))))
		  ((next-ci? "ex") 
		   (make-token (cons 'EXS (string-append o "ex"))))
		  ((next-ci? "px") 
		   (make-token (cons 'LENGTH (string-append o "px"))))
		  ((next-ci? "cm") 
		   (make-token (cons 'LENGTH (string-append o "cm"))))
		  ((next-ci? "mm") 
		   (make-token (cons 'LENGTH (string-append o "mm")))) 
		  ((next-ci? "in") 
		   (make-token (cons 'LENGTH (string-append o "in"))))
		  ((next-ci? "pt") 
		   (make-token (cons 'LENGTH (string-append o "pt"))))
		  ((next-ci? "pc") 
		   (make-token (cons 'LENGTH (string-append o "pc"))))
		  ((next-ci? "deg") 
		   (make-token (cons 'ANGLE (string-append o "deg"))))
		  ((next-ci? "rad") 
		   (make-token (cons 'ANGLE (string-append o "rad")))) 
		  ((next-ci? "grad") 
		   (make-token (cons 'ANGLE (string-append o "grad"))))
		  ((next-ci? "ms") 
		   (make-token (cons 'TIME (string-append o "ms"))))
		  ((next-ci? "s") 
		   (make-token (cons 'TIME (string-append o "s"))))
		  ((next-ci? "hz") 
		   (make-token (cons 'FREQUENCY (string-append o "hz"))))
		  ((next-ci? "khz")
		   (make-token (cons 'FREQUENCY (string-append o "khz"))))
		  ((next-ci? "%") 
		   (make-token (cons 'PERCENTAGE (string-append o "%"))))
		  (else (let ((c (lexer:read-char)))
			  (or (let ((cc (read-ident c)))
				(and cc 
				     (make-token (cons 'DIMENSION 
						       (string-append 
							o (list->string cc))))))
			      (begin (lexer:unread-char c)
				     (make-token (cons 'NUMBER o)))))))))
	      ((eqv? c #\.) (make-token '(DOT . ".")))
	      ((eqv? c #\;) (make-token '(SEMICOLON . ";")))
	      ((eqv? c #\:) (make-token '(COLON . ":")))
	      ((eqv? c #\=) (make-token '(EQUALS . "=")))
	      ((eqv? c #\[) (make-token '(LBRACK . "[")))
	      ((eqv? c #\]) (make-token '(RBRACK . "]")))
	      ((eqv? c #\)) (make-token '(RPAREN . ")")))
	      ((eqv? c #\}) (make-token '(RBRACE . "}")))
	      ((eqv? c #\{) (make-token '(LBRACE . "{")))
	      ((eqv? c #\+) (make-token '(PLUS . "+")))
	      ((eqv? c #\>) (make-token '(GREATER . ">")))
	      ((eqv? c #\,) (make-token '(COMMA . ",")))
	      ((eqv? c #\/) 
	       (let ((cc (lexer:peek-char)))
		 (if (and (char? cc) (char=? cc #\*))
		     (begin (lexer:read-char)
			    (consume-comments)
			    (make-token '(S . " ")))
		     (make-token '(DIV . "/")))))
	      ((eqv? c #\%) (make-token '(PERCENT . "%")))
	      ((or (eqv? c #\') (eqv? c #\")) 
	       (let ((s (read-string c)))
		 (if (or (null? s) (not (char=? (car (last-pair s)) c)))
		     (make-token (cons 'INVALID (list->string (cons c s))))
		     (make-token (cons 'STRING (list->string (cons c s)))))))
	      
	      ;; Identifiers can start with a hyphen, as long as it's not 
	      ;; followed by a digit.
	      
	      ((read-ident c) => 
	       (lambda (ident)
		 (let ((i (list->string ident))
		       (cc (lexer:peek-char)))
		   (cond ((and (not (eof-object? cc)) (char=? cc #\())
			  (begin (lexer:read-char)
				 (if (string-ci=? i "url")
				     (begin
				       (consume-whitespace)
				       (let ((r 
					      (let ((c (lexer:read-char)))
						(if (or (eqv? c #\') 
							(eqv? c #\"))
						    (list->string 
						     (cons c (read-string c)))
						    (begin
						      (lexer:unread-char c)
						      (list->string 
						       (read-url)))))))
					 (consume-whitespace)
					 (lexer:read-char)
					 (make-token 
					  (cons 'URI 
						(string-append "url(" r ")")))))
				     (make-token 
				      (cons 'FUNCTION (string-append i "("))))))
			 (else (make-token (cons 'IDENT i)))))))
	      ((eqv? c #\-) (make-token '(MINUS . "-")))
	      ((eqv? c #\#)
	       (let ((ident (read-ident (lexer:read-char))))
		 (if ident
		     (if (and (<= (length ident) 6) (for-all hex? ident))
			 (make-token (cons 'HEX (list->string ident)))
			 (make-token (cons 'IDSEL (list->string ident))))
		     (or (let ((cc (read-hex)))
			   (and cc (make-token (cons 'HEX (list->string cc)))))
			 (make-token '(HASH . "#"))))))
	      ((eqv? c #\@)
	       (cond ((next-ci? "import") (make-token '(IMPORT_SYM . "import")))
		     ((next-ci? "page") (make-token '(PAGE_SYM . "page")))
		     ((next-ci? "media") (make-token '(MEDIA_SYM . "media")))
		     ((next-ci? "charset") 
		      (make-token '(CHARSET_SYM . "charset")))
		     (else (make-token '(BAD_AT_KEYWORD . "@")))))
	      
	      ((eqv? c #\!)
	       (consume-whitespace-and-comments)
	       (if (next? "important") 
		   (make-token '(IMPORTANT_SYM . "!important"))
		   (make-token (cons 'DELIM (string c)))))
	      (else (make-token (cons 'DELIM (string c))))))))
)
