Posted to tcl by colin at Tue Feb 19 07:14:08 GMT 2013view pretty

#!/usr/bin/env tclsh
#lappend ::auto_path [pwd]
package require itcl
package require ylex
package require yeti

#
# Set up the macros and tokenizing rules for CSS
#
# http://www.w3.org/TR/CSS21/grammar.html
#
namespace eval css {

    proc MkScanner {} {
	variable sg [yeti::ylex #auto -name ::css::Scanner]
	$sg configure -case 0

	$sg macro h {[0-9a-f]}
	$sg macro nl {\n|\r\n|\r|\f}
	$sg macro s {[ \t\r\n\f]+}
	$sg macro w {<s>?}
	$sg macro nonascii {[\240-\377]}
	$sg macro unicode {\\<h>{1,6}(\r\n|[ \t\r\n\f])?}
	$sg macro escape {<unicode>|\\[^\r\n\f0-9a-f]}
	$sg macro nmstart {[_a-z]|<nonascii>|<escape>}
	$sg macro nmchar {[_a-z0-9-]|<nonascii>|<escape>}
	$sg macro ident {-?<nmstart><nmchar>*}
	$sg macro string1 {\"([^\n\r\f\\"]|\\<nl>|<escape>)*\"}
    $sg macro string2 {\'([^\n\r\f\\']|\\<nl>|<escape>)*\'}
    $sg macro string {<string1>|<string2>}
    $sg macro badstring1 {\"([^\n\r\f\\"]|\\<nl>|<escape>)*\\?}
	$sg macro badstring2 {\'([^\n\r\f\\']|\\<nl>|<escape>)*\\?}
	$sg macro badstring {<badstring1>|<badstring2>}
	$sg macro badcomment1 {\/\*[^*]*\*+([^/*][^*]*\*+)*}
	$sg macro badcomment2 {\/\*[^*]*(\*+[^/*][^*]*)*}
	$sg macro badcomment {<badcomment1>|<badcomment2>}
	$sg macro baduri1 {url\(<w>([!#$%&*-\[\]-~]|<nonascii>|<escape>)*<w>}
	$sg macro baduri2 {url\(<w><string><w>}
	$sg macro baduri3 {url\(<w><badstring>}
	$sg macro baduri {<baduri1>|<baduri2>|<baduri3>}
	$sg macro comment {\/\*[^*]*\*+([^/*][^*]*\*+)*\/}
	$sg macro name {<nmchar>+}
	$sg macro num {[0-9]+|[0-9]*\.[0-9]+}
	$sg macro url {([!#$%&*-~]|<nonascii>|<escape>)*}
	$sg macro meta {[$]<ident>}

	$sg macro A {a|\\0{0,4}(41|61)(\r\n|[ \t\r\n\f])?}
	$sg macro C {c|\\0{0,4}(43|63)(\r\n|[ \t\r\n\f])?}
	$sg macro D {d|\\0{0,4}(44|64)(\r\n|[ \t\r\n\f])?}
	$sg macro E {e|\\0{0,4}(45|65)(\r\n|[ \t\r\n\f])?}
	$sg macro G {g|\\0{0,4}(47|67)(\r\n|[ \t\r\n\f])?|\\g}
	$sg macro H {h|\\0{0,4}(48|68)(\r\n|[ \t\r\n\f])?|\\h}
	$sg macro I {i|\\0{0,4}(49|69)(\r\n|[ \t\r\n\f])?|\\i}
	$sg macro K {k|\\0{0,4}(4b|6b)(\r\n|[ \t\r\n\f])?|\\k}
	$sg macro L {l|\\0{0,4}(4c|6c)(\r\n|[ \t\r\n\f])?|\\l}
	$sg macro M {m|\\0{0,4}(4d|6d)(\r\n|[ \t\r\n\f])?|\\m}
	$sg macro N {n|\\0{0,4}(4e|6e)(\r\n|[ \t\r\n\f])?|\\n}
	$sg macro O {o|\\0{0,4}(4f|6f)(\r\n|[ \t\r\n\f])?|\\o}
	$sg macro P {p|\\0{0,4}(50|70)(\r\n|[ \t\r\n\f])?|\\p}
	$sg macro R {r|\\0{0,4}(52|72)(\r\n|[ \t\r\n\f])?|\\r}
	$sg macro S {s|\\0{0,4}(53|73)(\r\n|[ \t\r\n\f])?|\\s}
	$sg macro T {t|\\0{0,4}(54|74)(\r\n|[ \t\r\n\f])?|\\t}
	$sg macro U {u|\\0{0,4}(55|75)(\r\n|[ \t\r\n\f])?|\\u}
	$sg macro X {x|\\0{0,4}(58|78)(\r\n|[ \t\r\n\f])?|\\x}
	$sg macro Z {z|\\0{0,4}(5a|7a)(\r\n|[ \t\r\n\f])?|\\z}

	$sg add {<s>} {return S;}

	$sg add {\/\*[^*]*\*+([^/*][^*]*\*+)*\/} {}; # ignore comments
	$sg add {<badcomment>} {}; # unclosed comment at EOF

	$sg add {<!--} { return CDO }
	$sg add -- {-->} { return CDC }
	$sg add {~=} { return INCLUDES }
	$sg add {\|=} { return DASHMATCH }

	$sg add {<string>} { return [list STRING $yytext] }
	$sg add {<badstring>} { return BAD_STRING }

	$sg add {<ident>} { return [list IDENT $yytext] }

	$sg add {\#<name>} { return [list HASH $yytext] }

	$sg add {@<I><M><P><O><R><T>} { return IMPORT_SYM }
	$sg add {@<P><A><G><E>} { return PAGE_SYM }
	$sg add {@<M><E><D><I><A>} { return MEDIA_SYM }
	$sg add {@charset } { return CHARSET_SYM }

	$sg add {!(<w>|<comment>)*<I><M><P><O><R><T><A><N><T>} { return [list IMPORTANT_SYM $yytext] }

	$sg add {<num><E><M>} { return [list EMS $yytext] }
	$sg add {<num><E><X>} { return [list EXS $yytext] }
	$sg add {<num><P><X>} { return [list LENGTH $yytext] }
	$sg add {<num><C><M>} { return [list LENGTH $yytext] }
	$sg add {<num><M><M>} { return [list LENGTH $yytext] }
	$sg add {<num><I><N>} { return [list LENGTH $yytext] }
	$sg add {<num><P><T>} { return [list LENGTH $yytext] }
	$sg add {<num><P><C>} { return [list LENGTH $yytext] }
	$sg add {<num><D><E><G>} { return [list ANGLE $yytext] }
	$sg add {<num><R><A><D>} { return [list ANGLE $yytext] }
	$sg add {<num><G><R><A><D>} { return [list ANGLE $yytext] }
	$sg add {<num><M><S>} { return [list TIME $yytext] }
	$sg add {<num><S>} { return [list TIME $yytext] }
	$sg add {<num><H><Z>} { return [list FREQ $yytext] }
	$sg add {<num><K><H><Z>} { return [list FREQ $yytext] }
	$sg add {<num><ident>} { return [list DIMENSION $yytext] }

	$sg add {<num>%} { return [list PERCENTAGE $yytext] }
	$sg add {<num>} { return [list NUMBER $yytext] }

	$sg add {url\(<w><string><w>\)} { return [list URI $yytext] }
	$sg add {url\(<w><url><w>\)} { return [list URI $yytext] }
	$sg add {<baduri>} { return BAD_URI }

	$sg add {(<ident>)\(} { return [list FUNCTION $1] }

	$sg add {\{} { return LBRACE }
	$sg add {\}} { return RBRACE }
	$sg add {\(} { return LPAREN }
	$sg add {\)} { return RPAREN }
	$sg add {\[} { return LBRACKET }
	$sg add {\]} { return RBRACKET }
	$sg add {\*} { return STAR }
	$sg add {;+} { return ";" }

	$sg add {.} { return $yytext }
    }

    proc MkParser {} {
	#
	# Set up the parsing rules
	#
	variable pg [yeti::yeti #auto -name ::css::Parser]
	$pg add start { stylesheet } { return $1 }


	# stylesheet
	#   : [ CHARSET_SYM STRING ';' ]?
	#     [S|CDO|CDC]* [ import [ CDO S* | CDC S* ]* ]*
	#     [ [ ruleset | media | page ] [ CDO S* | CDC S* ]* ]*
	#   ;
	$pg add {
	    stylesheet { CHARSET_SYM STRING ; stylesheet } { return [concat [list charset $2] $4] }
	    |          { ruleset stylesheet } { return [concat [list ruleset $1] $2] }
	    |          { import stylesheet } { return [concat [list import $1] $2] }
	    |          { media stylesheet } { return [list media $1 $2] }
	    |          { page stylesheet } { return [concat [list page $1] $2] }
	    |          { S* stylesheet } { return $2 }
	    |          {} -
	}

	# import
	#   : IMPORT_SYM S*
	#     [STRING|URI] S* media_list? ';' S*
	#   ;
	$pg add {
	    import { IMPORT_SYM S* STRING S* media_list ; S* } { return [list $3 $5] }
	    |      { IMPORT_SYM S* STRING S* ; S* } { return $3 }
	    |      { IMPORT_SYM S* URI S* media_list ; S* } { return [list $3 $5] }
	    |      { IMPORT_SYM S* URI S* ; S* } { return $3 }
	}
	
	# media
	#   : MEDIA_SYM S* media_list '{' S* ruleset* '}' S*
	#   ;
	$pg add media { MEDIA_SYM S* media_list LBRACE S* ruleset_seq RBRACE S* } { return [list $3 $6] }
    
	$pg add {
	    ruleset_seq { ruleset ruleset_seq } { return [concat [list ruleset $1] $2] }
	    |           { ruleset } { return $1 }
	    |           {} -
	}

	# media_list
	#   : medium [ COMMA S* medium]*
	#   ;
	$pg add {
	    media_list { medium , S* media_list } { return [concat $1 {*}$4] }
	    |          { medium } { return $1 }
	}

	# medium
	#   : IDENT S*
	#   ;
	$pg add medium { IDENT S* } { return $1 }
	
	# page
	#   : PAGE_SYM S* pseudo_page?
	#     '{' S* declaration? [ ';' S* declaration? ]* '}' S*
	#   ;
	$pg add {
	    page { PAGE_SYM S* pseudo_page LBRACE S* declaration_seq RBRACE S* } { return [list $3 $6] }
	    |    { PAGE_SYM S* LBRACE S* declaration_seq RBRACE S* } { return [list $2 $5] }
	}

	# pseudo_page
	#   : ':' IDENT S*
	#   ;
	$pg add pseudo_page { : IDENT S* } { return :$2 }
    
	# operator
	#   : '/' S* | ',' S*
	#   ;
	$pg add {
	    operator { / S* } { return "/" }
	    |        { , S* } { return "," }
	    |        { = S* } { return "=" }
	}
    
	# combinator
	#   : '+' S*
	#   | '>' S*
	#   ;
	$pg add {
	    combinator { + S* } { return "+" }
	    |          { > S* } { return ">" }
	}
	
	# unary_operator
	#   : '-' | '+'
	#   ;
	$pg add {
	    unary_operator { - } { return "-" }
	    |              { + } { return "+" }
	}
	
	# property
	#   : IDENT S*
	#   ;
	$pg add property { IDENT S* } { return $1 }
	
	# ruleset
	#   : selector [ ',' S* selector ]*
	#     '{' S* declaration? [ ';' S* declaration? ]* '}' S*
	#   ;
	$pg add ruleset { selector_seq S* LBRACE S* declaration_seq RBRACE S* } { return [list $1 $5] }
    
	$pg add {
	    selector_seq { selector , S* selector_seq } { return [list $1 {*}$4] }
	    |            { selector } { return [list $1] }
	    |            {} -
	}
	$pg add {
	    declaration_seq { declaration ; S* declaration_seq } { return [list $1 {*}$4] }
	    |               { declaration } { return $1 }
	    |               {} -
	}
    
	
	# selector
	#   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
	#   ;
	$pg add {
	    selector { simple_selector } { return $1 }
	    |        { simple_selector combinator selector } { return [concat $1 $2 {*}$3] }
	    |        { simple_selector S S* combinator selector } { return [concat $1 $4 {*}$5] }
	    |        { simple_selector S S* selector } { return [concat $1 {*}$4] }
	    |        {} -
	}
    
	# simple_selector
	#   : element_name [ HASH | class | attrib | pseudo ]*
	#   | [ HASH | class | attrib | pseudo ]+
	#   ;
	$pg add {
	    simple_selector { element_name selector_sub } { return [join [list $1 {*}$2] ""] }
	    |               { element_name } { return $1 }
	    |               { selector_sub } { return $1 }
	}
	$pg add {
	    selector_sub { HASH selector_sub } { return [concat $1 {*}$2] }
	    |            { class selector_sub } { return [concat $1 {*}$2] }
	    |            { attrib selector_sub } { return [concat $1 {*}$2] }
	    |            { pseudo selector_sub } { return [concat $1 {*}$2] }
	    |            { HASH } { return $1 }
	    |            { class } { return $1 }
	    |            { attrib } { return $1 }
	    |            { pseudo } { return $1 }
	}
	
	# class
	#   : '.' IDENT
	#   ;
	$pg add class { . IDENT } { return .$2 }
	
	# element_name
	#   : IDENT | '*'
	#   ;
	$pg add {
	    element_name { IDENT } { return $1 }
	    |            { STAR } { return $1 }
	}
    
	# attrib
	#   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
	#     [ IDENT | STRING ] S* ]? ']'
	#   ;
	$pg add {
	    attrib { LBRACKET S* IDENT S* RBRACKET } { return $3 }
	    |      { LBRACKET S* IDENT S* = S* IDENT S* RBRACKET } { return [list $3 = $7] }
	    |      { LBRACKET S* IDENT S* = S* STRING S* RBRACKET } { return [list $3 = $7] }
	    |      { LBRACKET S* IDENT S* INCLUDES S* IDENT S* RBRACKET } { return [list $3 $5 $7] }
	    |      { LBRACKET S* IDENT S* INCLUDES S* STRING S* RBRACKET } { return [list $3 $5 $7] }
	    |      { LBRACKET S* IDENT S* DASHMATCH S* IDENT S* RBRACKET } { return [list $3 $5 $7] }
	    |      { LBRACKET S* IDENT S* DASHMATCH S* STRING S* RBRACKET } { return [list $3 $5 $7] }
	}
	
	# pseudo
	#   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
	#   ;
	$pg add {
	    pseudo { : IDENT } { return :$2 }
	    |      { : FUNCTION S* RPAREN } { return :$2 }
	    |      { : FUNCTION S* IDENT S* RPARENT } { return [list :$2 $4] }
	}
    
	# declaration
	#   : property ':' S* expr prio?
	#   ;
	$pg add {
	    declaration { property : S* expr } { return [list $1 $4] }
	    |           { property : S* expr prio } { return [list $1 $4 $5] }
	}
    
	# prio
	#   : IMPORTANT_SYM S*
	#   ;
	$pg add prio { IMPORTANT_SYM S* } { return $1 }
	
	# expr
	#   : term [ operator? term ]*
	#   ;
	$pg add {
	    expr { term } { return $1 }
	    |    { term expr } { return [list $1 {*}$2] }
	    |    { term operator expr } { return [list $1 $2 {*}$3] }
	}
	
	# term
	#   : unary_operator?
	#     [ NUMBER S* | PERCENTAGE S* | LENGTH S* | EMS S* | EXS S* | ANGLE S* |
	#       TIME S* | FREQ S* ]
	#   | STRING S* | IDENT S* | URI S* | hexcolor | function
	#   ;
	$pg add {
	    term { unary_operator term_numeric } { return $1$2 }
	    |    { term_numeric } { return $1 }
	    |    { STRING S* } { return $1 }
	    |    { IDENT S* } { return $1 }
	    |    { URI S* } { return $1 }
	    |    { hexcolor } { return $1 }
	    |    { function } { return $1 }
	}
	
	$pg add {
	    term_numeric { NUMBER S* } { return $1 }
	    |            { PERCENTAGE S* } { return $1 }
	    |            { LENGTH S* } { return $1 }
	    |            { EMS S* } { return $1 }
	    |            { EXS S* } { return $1 }
	    |            { ANGLE S* } { return $1 }
	    |            { TIME S* } { return $1 }
	    |            { FREQ S* } { return $1 }
	}
	
	# function
	#   : FUNCTION S* expr ')' S*
	#   ;
	$pg add function { FUNCTION S* expr RPAREN S* } { return [list $1 $3] }
	
	# /*
	#  * There is a constraint on the color that it must
	#  * have either 3 or 6 hex-digits (i.e., [0-9a-fA-F])
	#  * after the "#"; e.g., "#000" is OK, but "#abcd" is not.
	#  */
	# hexcolor
	#   : HASH S*
	#   ;
	$pg add hexcolor { HASH S* } { return $1 }
	
	$pg add {
	    S* { S S* } { return S }
	    |  { S } { return S }
	    |  {} {}
	}
	#$pg configure -verbose 3
    }

    proc Timer {} {
	lappend result MkScanner [time {MkScanner; ::css::Scanner destroy}]
	lappend result MkParser [time {MkParser; ::css::Parser destroy}]
	variable sg; lappend result SG [time {eval [$sg dump]}]
	variable pg; lappend result PG [time {eval [$pg dump]}]
	return $result
    }

    #
    # Now, build and eval the scanner and parser
    #
    MkScanner; eval [$sg dump]
    MkParser; eval [$pg dump]
    
    #
    # identity - turn parse tree into CSS
    #
    proc identity { css { indent 0 } } {
	set in0 [string repeat " " $indent]
	set in1 [string repeat " " [expr {$indent+2}]]

	foreach { type block } $css {

	    switch $type {
		ruleset {
		    lassign $block name rules
		    puts "$in0[join $name ", "] \{ "
		    foreach rule $rules {
			lassign $rule name value
			puts "$in1$name: $value;"
		    }
		    puts "$in0\}"
		}
		media {
		    lassign $block type rules
		    puts "$in0@media $type \{"
		    ppcss $rules 2
		    puts "$in0\}"
		}

	    }

	}
    }
    Scanner ::css::scanner
    Parser ::css::parser -scanner ::css::scanner

    proc parse {text} {
	scanner start $text
	parser reset
	return [parser parse]
    }

    namespace export -clear *
    namespace ensemble create -subcommands {}
}

#
# And, the test code..
#
if {[info exists argv0] && [file normalize $argv0] eq [file normalize [info script]]} {
    #
    # .. this is a simple pretty-printer for lists
    #
    proc pplist { l { indent 0 } } {

	if { !$indent } {
	    puts "\{"
	    pplist $l [expr {$indent+1}]
	    puts "\}"
	} else {

	    foreach v $l {

		set i [string repeat "  " $indent]

		if { [llength $v] <= 1 } {
		    puts "$i$v"
		} else {
		    puts "$i\{"
		    pplist $v [expr {$indent+1}]
		    puts "$i\}"
		}

	    }

	}

    }

    if {[llength $argv] == 0} {

	puts "Usage:  $argv0 (demo|dump|parse)"
	puts ""
	puts "The \"demo\" mode runs the parser on a small test-case.  The \"dump\""
	puts "mode dumps out the lexer and scanner code.  The \"parse\" mode runs"
	puts "the parser on stdin."

    } else {

	set demo {
	    @charset "utf-8";
	    body, h1 {
		margin: 0px;
		font-family: "URW Palladio L", Palatino, "Times New Roman", Times, serif;
		color: #000;
	    }
	    input {
		background-color: #111;
		filter:alpha(opacity=40);
	    }
	    .button:moop, .button:hover {
		border: solid thin red !important;
	    }
	    .button img { float: left ; }
	    .button img.nopad { padding: 0px; }
	}

	switch [lindex $argv 0] {
	    demo {

		#
		# Demo mode - use a simple built-in test case
		#
		#
		# Run the scanner
		#
		puts [::css parse $demo]

	    }

	    time {
		puts PARSE:[time {::css parse $demo}]
		puts Scanner:[time {::css::Scanner #auto}]
		puts Parser:[time {::css::Parser #auto -scanner ::css::scanner}]
		puts [::css::Timer]
	    }

	    dump {
		#
		# Dump the scanner and lexer code
		#
		puts [::css::Scanner dump]
		puts [::css::Parser dump]
	    }

	    parse {
		#
		# Parse on stdin
		#
		while { ![eof stdin] } {
		    append demo [gets stdin]
		}

		#
		# Run the scanner
		#
		pplist [css parse $demo]
	    }

	}

    }
}