Posted to tcl by colin at Sun Feb 17 22:10:33 GMT 2013view pretty

#!/usr/bin/env tclsh
package require itcl
package require ylex
package require yeti

#
# Set up the macros and tokenizing rules for CSS
#
# http://www.w3.org/TR/CSS21/grammar.html
#
namespace eval css {

    variable sg [yeti::ylex #auto -name ::css::Scanner]
    $sg configure -case 0

    $sg macro h {[0-9a-f]}
    $sg macro nl {\n|\r\n|\r|\f}
    $sg macro s {[ \t\r\n\f]+}
    $sg macro w {<s>?}
    $sg macro nonascii {[\240-\377]}
    $sg macro unicode {\\<h>{1,6}(\r\n|[ \t\r\n\f])?}
    $sg macro escape {<unicode>|\\[^\r\n\f0-9a-f]}
    $sg macro nmstart {[_a-z]|<nonascii>|<escape>}
    $sg macro nmchar {[_a-z0-9-]|<nonascii>|<escape>}
    $sg macro string1 {\"([^\n\r\f\\"]|\\<nl>|<escape>)*\"}
    $sg macro string2 {\'([^\n\r\f\\']|\\<nl>|<escape>)*\'}
    $sg macro string {<string1>|<string2>}
    $sg macro badstring1 {\"([^\n\r\f\\"]|\\<nl>|<escape>)*\\?}
    $sg macro badstring2 {\'([^\n\r\f\\']|\\<nl>|<escape>)*\\?}
    $sg macro badstring {<badstring1>|<badstring2>}
    $sg macro badcomment1 {\/\*[^*]*\*+([^/*][^*]*\*+)*}
    $sg macro badcomment2 {\/\*[^*]*(\*+[^/*][^*]*)*}
    $sg macro badcomment {<badcomment1>|<badcomment2>}
    $sg macro baduri1 {url\(<w>([!#$%&*-\[\]-~]|<nonascii>|<escape>)*<w>}
    $sg macro baduri2 {url\(<w><string><w>}
    $sg macro baduri3 {url\(<w><badstring>}
    $sg macro baduri {<baduri1>|<baduri2>|<baduri3>}
    $sg macro comment {\/\*[^*]*\*+([^/*][^*]*\*+)*\/}
    $sg macro ident {-?<nmstart><nmchar>*}
    $sg macro name {<nmchar>+}
    $sg macro num {[0-9]+|[0-9]*\.[0-9]+}
    $sg macro url {([!#$%&*-~]|<nonascii>|<escape>)*}
    
    $sg macro A {a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?}
    $sg macro C {c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?}
    $sg macro D {d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?}
    $sg macro E {e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?}
    $sg macro G {g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g}
    $sg macro H {h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h}
    $sg macro I {i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i}
    $sg macro K {k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k}
    $sg macro L {l|\\0{0,4}(4c|6c)(\r\n|[ \t\r\n\f])?|\\l}
    $sg macro M {m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m}
    $sg macro N {n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n}
    $sg macro O {o|\\0{0,4}(4f|6f)(\r\n|[ \t\r\n\f])?|\\o}
    $sg macro P {p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p}
    $sg macro R {r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r}
    $sg macro S {s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s}
    $sg macro T {t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t}
    $sg macro U {u|\\0{0,4}(55|75)(\r\n|[ \t\r\n\f])?|\\u}
    $sg macro X {x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x}
    $sg macro Z {z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z}

    $sg add {<s>} {return S;}

    $sg add {\/\*[^*]*\*+([^/*][^*]*\*+)*\/} {}; # ignore comments
    $sg add {<badcomment>} {}; # unclosed comment at EOF

    $sg add {<!--} { return CDO }
    $sg add -- {-->} { return CDC }
    $sg add {~=} { return INCLUDES }
    $sg add {\|=} { return DASHMATCH }

    $sg add {<string>} { return [list STRING $yytext] }
    $sg add {<badstring>} { return BAD_STRING }

    $sg add {<ident>} { return [list IDENT $yytext] }

    $sg add {\#<name>} { return [list HASH $yytext] }

    $sg add {@<I><M><P><O><R><T>} { return IMPORT_SYM }
    $sg add {@<P><A><G><E>} { return PAGE_SYM }
    $sg add {@<M><E><D><I><A>} { return MEDIA_SYM }
    $sg add {@charset } { return CHARSET_SYM }

    $sg add {!(<w>|<comment>)*<I><M><P><O><R><T><A><N><T>} { return [list IMPORTANT_SYM $yytext] }

    $sg add {<num><E><M>} { return [list EMS $yytext] }
    $sg add {<num><E><X>} { return [list EXS $yytext] }
    $sg add {<num><P><X>} { return [list LENGTH $yytext] }
    $sg add {<num><C><M>} { return [list LENGTH $yytext] }
    $sg add {<num><M><M>} { return [list LENGTH $yytext] }
    $sg add {<num><I><N>} { return [list LENGTH $yytext] }
    $sg add {<num><P><T>} { return [list LENGTH $yytext] }
    $sg add {<num><P><C>} { return [list LENGTH $yytext] }
    $sg add {<num><D><E><G>} { return [list ANGLE $yytext] }
    $sg add {<num><R><A><D>} { return [list ANGLE $yytext] }
    $sg add {<num><G><R><A><D>} { return [list ANGLE $yytext] }
    $sg add {<num><M><S>} { return [list TIME $yytext] }
    $sg add {<num><S>} { return [list TIME $yytext] }
    $sg add {<num><H><Z>} { return [list FREQ $yytext] }
    $sg add {<num><K><H><Z>} { return [list FREQ $yytext] }
    $sg add {<num><ident>} { return [list DIMENSION $yytext] }

    $sg add {<num>%} { return [list PERCENTAGE $yytext] }
    $sg add {<num>} { return [list NUMBER $yytext] }

    $sg add {url\(<w><string><w>\)} { return [list URI $yytext] }
    $sg add {url\(<w><url><w>\)} { return [list URI $yytext] }
    $sg add {<baduri>} { return BAD_URI }

    $sg add {(<ident>)\(} { return [list FUNCTION $1] }

    $sg add {\{} { return LBRACE }
    $sg add {\}} { return RBRACE }
    $sg add {\(} { return LPAREN }
    $sg add {\)} { return RPAREN }
    $sg add {\[} { return LBRACKET }
    $sg add {\]} { return RBRACKET }
    $sg add {\*} { return STAR }
    $sg add {;+} { return ";" }

    $sg add {.} { return $yytext }

    #
    # Set up the parsing rules
    #
    variable pg [yeti::yeti #auto -name ::css::Parser]
    $pg add start { stylesheet } { return $1 }


    # stylesheet
    #   : [ CHARSET_SYM STRING ';' ]?
    #     [S|CDO|CDC]* [ import [ CDO S* | CDC S* ]* ]*
    #     [ [ ruleset | media | page ] [ CDO S* | CDC S* ]* ]*
    #   ;
    $pg add {
	stylesheet { CHARSET_SYM STRING ; stylesheet } { return [concat [list charset $2] $4] }
	|          { ruleset stylesheet } { return [concat [list ruleset $1] $2] }
	|          { import stylesheet } { return [concat [list import $1] $2] }
	|          { media stylesheet } { return [list media $1 $2] }
	|          { page stylesheet } { return [concat [list page $1] $2] }
	|          { S* stylesheet } { return $2 }
	|          {} -
    }

    # import
    #   : IMPORT_SYM S*
    #     [STRING|URI] S* media_list? ';' S*
    #   ;
    $pg add {
	import { IMPORT_SYM S* STRING S* media_list ; S* } { return [list $3 $5] }
	|      { IMPORT_SYM S* STRING S* ; S* } { return $3 }
	|      { IMPORT_SYM S* URI S* media_list ; S* } { return [list $3 $5] }
	|      { IMPORT_SYM S* URI S* ; S* } { return $3 }
    }

    # media
    #   : MEDIA_SYM S* media_list '{' S* ruleset* '}' S*
    #   ;
    $pg add media { MEDIA_SYM S* media_list LBRACE S* ruleset_seq RBRACE S* } { return [list $3 $6] }
    
    $pg add {
	ruleset_seq { ruleset ruleset_seq } { return [concat [list ruleset $1] $2] }
	|           { ruleset } { return $1 }
	|           {} -
    }

    # media_list
    #   : medium [ COMMA S* medium]*
    #   ;
    $pg add {
	media_list { medium , S* media_list } { return [concat $1 {*}$4] }
	|          { medium } { return $1 }
    }

    # medium
    #   : IDENT S*
    #   ;
    $pg add medium { IDENT S* } { return $1 }

    # page
    #   : PAGE_SYM S* pseudo_page?
    #     '{' S* declaration? [ ';' S* declaration? ]* '}' S*
    #   ;
    $pg add {
	page { PAGE_SYM S* pseudo_page LBRACE S* declaration_seq RBRACE S* } { return [list $3 $6] }
	|    { PAGE_SYM S* LBRACE S* declaration_seq RBRACE S* } { return [list $2 $5] }
    }

    # pseudo_page
    #   : ':' IDENT S*
    #   ;
    $pg add pseudo_page { : IDENT S* } { return :$2 }
    
    # operator
    #   : '/' S* | ',' S*
    #   ;
    $pg add {
	operator { / S* } { return "/" }
	|        { , S* } { return "," }
	|        { = S* } { return "=" }
    }
    
    # combinator
    #   : '+' S*
    #   | '>' S*
    #   ;
    $pg add {
	combinator { + S* } { return "+" }
	|          { > S* } { return ">" }
    }
    
    # unary_operator
    #   : '-' | '+'
    #   ;
    $pg add {
	unary_operator { - } { return "-" }
	|              { + } { return "+" }
    }
    
    # property
    #   : IDENT S*
    #   ;
    $pg add property { IDENT S* } { return $1 }
    
    # ruleset
    #   : selector [ ',' S* selector ]*
    #     '{' S* declaration? [ ';' S* declaration? ]* '}' S*
    #   ;
    $pg add ruleset { selector_seq S* LBRACE S* declaration_seq RBRACE S* } { return [list $1 $5] }
    
    $pg add {
	selector_seq { selector , S* selector_seq } { return [list $1 {*}$4] }
	|            { selector } { return [list $1] }
	|            {} -
    }
    $pg add {
	declaration_seq { declaration ; S* declaration_seq } { return [list $1 {*}$4] }
	|               { declaration } { return $1 }
	|               {} -
    }
    
    
    # selector
    #   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
    #   ;
    $pg add {
	selector { simple_selector } { return $1 }
	|        { simple_selector combinator selector } { return [concat $1 $2 {*}$3] }
	|        { simple_selector S S* combinator selector } { return [concat $1 $4 {*}$5] }
	|        { simple_selector S S* selector } { return [concat $1 {*}$4] }
	|        {} -
    }
    
    # simple_selector
    #   : element_name [ HASH | class | attrib | pseudo ]*
    #   | [ HASH | class | attrib | pseudo ]+
    #   ;
    $pg add {
	simple_selector { element_name selector_sub } { return [join [list $1 {*}$2] ""] }
	|               { element_name } { return $1 }
	|               { selector_sub } { return $1 }
    }
    $pg add {
	selector_sub { HASH selector_sub } { return [concat $1 {*}$2] }
	|            { class selector_sub } { return [concat $1 {*}$2] }
	|            { attrib selector_sub } { return [concat $1 {*}$2] }
	|            { pseudo selector_sub } { return [concat $1 {*}$2] }
	|            { HASH } { return $1 }
	|            { class } { return $1 }
	|            { attrib } { return $1 }
	|            { pseudo } { return $1 }
    }
    
    # class
    #   : '.' IDENT
    #   ;
    $pg add class { . IDENT } { return .$2 }
    
    # element_name
    #   : IDENT | '*'
    #   ;
    $pg add {
	element_name { IDENT } { return $1 }
	|            { STAR } { return $1 }
    }
    
    # attrib
    #   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
    #     [ IDENT | STRING ] S* ]? ']'
    #   ;
    $pg add {
	attrib { LBRACKET S* IDENT S* RBRACKET } { return $3 }
	|      { LBRACKET S* IDENT S* = S* IDENT S* RBRACKET } { return [list $3 = $7] }
	|      { LBRACKET S* IDENT S* = S* STRING S* RBRACKET } { return [list $3 = $7] }
	|      { LBRACKET S* IDENT S* INCLUDES S* IDENT S* RBRACKET } { return [list $3 $5 $7] }
	|      { LBRACKET S* IDENT S* INCLUDES S* STRING S* RBRACKET } { return [list $3 $5 $7] }
	|      { LBRACKET S* IDENT S* DASHMATCH S* IDENT S* RBRACKET } { return [list $3 $5 $7] }
	|      { LBRACKET S* IDENT S* DASHMATCH S* STRING S* RBRACKET } { return [list $3 $5 $7] }
    }
    
    # pseudo
    #   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
    #   ;
    $pg add {
	pseudo { : IDENT } { return :$2 }
	|      { : FUNCTION S* RPAREN } { return :$2 }
	|      { : FUNCTION S* IDENT S* RPARENT } { return [list :$2 $4] }
    }
    
    # declaration
    #   : property ':' S* expr prio?
    #   ;
    $pg add {
	declaration { property : S* expr } { return [list $1 $4] }
	|           { property : S* expr prio } { return [list $1 $4 $5] }
    }
    
    # prio
    #   : IMPORTANT_SYM S*
    #   ;
    $pg add prio { IMPORTANT_SYM S* } { return $1 }
    
    # expr
    #   : term [ operator? term ]*
    #   ;
    $pg add {
	expr { term } { return $1 }
	|    { term expr } { return [list $1 {*}$2] }
	|    { term operator expr } { return [list $1 $2 {*}$3] }
    }
    
    # term
    #   : unary_operator?
    #     [ NUMBER S* | PERCENTAGE S* | LENGTH S* | EMS S* | EXS S* | ANGLE S* |
    #       TIME S* | FREQ S* ]
    #   | STRING S* | IDENT S* | URI S* | hexcolor | function
    #   ;
    $pg add {
	term { unary_operator term_numeric } { return $1$2 }
	|    { term_numeric } { return $1 }
	|    { STRING S* } { return $1 }
	|    { IDENT S* } { return $1 }
	|    { URI S* } { return $1 }
	|    { hexcolor } { return $1 }
	|    { function } { return $1 }
    }
    
    $pg add {
	term_numeric { NUMBER S* } { return $1 }
	|            { PERCENTAGE S* } { return $1 }
	|            { LENGTH S* } { return $1 }
	|            { EMS S* } { return $1 }
	|            { EXS S* } { return $1 }
	|            { ANGLE S* } { return $1 }
	|            { TIME S* } { return $1 }
	|            { FREQ S* } { return $1 }
    }

    # function
    #   : FUNCTION S* expr ')' S*
    #   ;
    $pg add function { FUNCTION S* expr RPAREN S* } { return [list $1 $3] }
    
    # /*
    #  * There is a constraint on the color that it must
    #  * have either 3 or 6 hex-digits (i.e., [0-9a-fA-F])
    #  * after the "#"; e.g., "#000" is OK, but "#abcd" is not.
    #  */
    # hexcolor
    #   : HASH S*
    #   ;
    $pg add hexcolor { HASH S* } { return $1 }

    $pg add {
	S* { S S* } { return S }
	|  { S } { return S }
	|  {} {}
    }


    #
    # Now, build and eval the scanner and parser
    #
    eval [$sg dump]
    eval [$pg dump]

    #
    # identity - turn parse tree into CSS
    #
    proc identity { css { indent 0 } } {
	set in0 [string repeat " " $indent]
	set in1 [string repeat " " [expr {$indent+2}]]

	foreach { type block } $css {

	    switch $type {
		ruleset {
		    lassign $block name rules
		    puts "$in0[join $name ", "] \{ "
		    foreach rule $rules {
			lassign $rule name value
			puts "$in1$name: $value;"
		    }
		    puts "$in0\}"
		}
		media {
		    lassign $block type rules
		    puts "$in0@media $type \{"
		    ppcss $rules 2
		    puts "$in0\}"
		}

	    }

	}
    }
    Scanner ::css::scanner
    Parser ::css::parser -scanner ::css::scanner

    proc parse {text} {
	scanner start $text
	parser reset
	return [parser parse]
    }

    namespace export -clear *
    namespace ensemble create -subcommands {}
}

#
# And, the test code..
#
if {[info exists argv0] && [file normalize $argv0] eq [file normalize [info script]]} {
    #
    # .. this is a simple pretty-printer for lists
    #
    proc pplist { l { indent 0 } } {

	if { !$indent } {
	    puts "\{"
	    pplist $l [expr {$indent+1}]
	    puts "\}"
	} else {

	    foreach v $l {

		set i [string repeat "  " $indent]

		if { [llength $v] <= 1 } {
		    puts "$i$v"
		} else {
		    puts "$i\{"
		    pplist $v [expr {$indent+1}]
		    puts "$i\}"
		}

	    }

	}

    }

    if {[llength $argv] == 0} {

	puts "Usage:  $argv0 (demo|dump|parse)"
	puts ""
	puts "The \"demo\" mode runs the parser on a small test-case.  The \"dump\""
	puts "mode dumps out the lexer and scanner code.  The \"parse\" mode runs"
	puts "the parser on stdin."

    } else {

	switch [lindex $argv 0] {
	    demo {

		#
		# Demo mode - use a simple built-in test case
		#
		set demo {
		    @charset "utf-8";
		    body, h1 {
			margin: 0px;
			font-family: "URW Palladio L", Palatino, "Times New Roman", Times, serif;
			color: #000;
		    }
		    input {
			background-color: #111;
			filter:alpha(opacity=40);
		    }
		    .button:moop, .button:hover {
			border: solid thin red !important;
		    }
		    .button img { float: left ; }
		    .button img.nopad { padding: 0px; }
		}

		#
		# Run the scanner
		#
		puts [::css parse $demo]

	    }

	    dump {
		#
		# Dump the scanner and lexer code
		#
		puts [::css::Scanner dump]
		puts [::css::Parser dump]
	    }

	    parse {

		#
		# Parse on stdin
		#
		while { ![eof stdin] } {
		    append demo [gets stdin]
		}

		#
		# Run the scanner
		#
		pplist [css parse $demo]

	    }

	}

    }
}