# normal.pl
# This is a perl script
# SGML normaliser, Henry S. Thompson, HCRC (hthompson@uk.ac.edinburgh)

# evaluates its first argument, if defaults need changing
# use "do <file>" if you've got a lot

# based on sgmls.pl from the sgmls distribution
# pipe sgmls output into this and get normalised sgml back out

# only covers elements and data at the moment
# volunteers to extend it welcome

# set @tags_only to an array to suppress data and end tags,
# e.g. for tag counting.  Array contents are universal att_order filter,
# e.g. (type) for printing of type attributes (if any) only.

# attribute printing tries to be smart -- no IMPLIED, quotes only when needed
# will suppress defaults per %defaults(attribute,default-value,...)
# will sort attributes for element by %att_order(element,a1;a2;a3,...)

# this version is VERY simplistic in its treatment of attributes, but
# four times faster as a result

# defaults to output entity references for & and < where necessary
# change %entities to change this -- you can use the evaluation of 
# first arg to achieve this, e.g. post.pl 'push(@entities,"s/banana/&test;/g")'

# Use @blank_line_between(1,2,...) to allow/force blank lines between lines
# matching patterns 1 and 2

# DMCK 3/12/93 Added new parameter @empty which is a pattern which matches
# empty elements i.e. those which do not need to have endtags
# eg $empty="(pb|milestone|omit|ptr)"

# HT 10/2/94 Added new flag keep_blanks which if true will not eliminate
# initial blanks

# Here's an example script I use for invocation for the TEI DTDs:
###############################################################################
# #!/bin/sh
# # post-process sgmls output to make canonical version or tag listing
# def='("format","normal","full","yes","direct","unspecified","org","uniform",
#       "part","n","sample","complete","place","unspecified","targorder","u",
#       "anchored","yes",
#       "gram","yes","rel","notrel","vg","notvg","coord","notcoord",
#       "fvs","notfvs")'
# # blb='("/./","/^<div/","m|^</|","/^<p[ >]/","/^<div/","/./")'
# if [ "$1" = "-to" ]
#  then
#   tof='@tags_only=("type","id");'
#   shift
#  else
#   blb='@blank_line_between=("/./","/^<div.?\$/","m|^</|","/^<(p|table|list|head|tail|trailer|item)\$/");'
# fi
# if [ "$1" = "-kib" ]
#  then
#   kib='$keep_blanks=1;'
#   shift
# fi
# if [ "$1" = "-kent" ]
#  then
#   ent='@entities=();'
#   shift
#  else
#   ent='@entities=("s|&([a-zA-Z])|&amp;\$1|g","s/<([a-zA-Z])/&lab;\$1/g",
#                   "s/&@/&/g");'
# fi
# ato='("div0","type","div1","type","div2","type","div3","type","div4","type","head","type","p","type","corr","sic;rend")'
# perl normal.pl "$ent\
# 		  %defaults=$def;\
# 		  \$empty='(omit|pb|lb|milestone|rule)';\
# 		  $blb\
# 		  $tof\
# 		  $kib\
# 		  %att_order=$ato"
###############################################################################


$bol=1;

$keep_blanks=0;

$prog = $0;

$prog =~ s|.*/||;

$backslash_in_data = "\\";

@entities=("s/&([a-zA-Z])/&amp;\$1/g",
	   "s#<([/a-zA-Z])#&lab;\$1#g");

@tags_only = ();

eval $ARGV[0];
warn $@ if ($@);

# compute the doent filter based on @entities

$ep="sub doent {";
foreach $subst (@entities) {
    $ep.="\$_[0] =~ $subst;"
    };
$ep.="
warn \"initial <\" if (/(^|\\n)</);
warn \"untouched </\" if (m|</|);
}";
# print "$ep\n";
eval $ep;
warn $@ if ($@);

# compute the blb filter based on @blank_line_between

$bb="sub blb {
local(\$prev,\$cur)=\@_;
(";
while (@blank_line_between) {
    $bb.="((\$prev=~".
         shift(@blank_line_between).
         ") && (\$cur=~".
         shift(@blank_line_between).
         "))";
    $bb.=" || " if (@blank_line_between);
    };
$bb.=")
}";
# print "$bb\n";
eval $bb;
warn $@ if ($@);

while (<STDIN>) {
    chop;
    $command = substr($_, 0, 1);
    substr($_, 0, 1) = "";
    if ($command eq '(') {
	tr/A-Z/a-z/;		# all my tags are l.c. -- ht
	&start_element($_);
	%attribute_type=();
	%attribute_value=();
    }
    elsif ($command eq ')') {
	tr/A-Z/a-z/;		# all my tags are l.c. -- ht
	&end_element($_) unless (@tags_only);
    }
    elsif ($command eq '-') {
	if (!@tags_only) {
             &unescape_data($_);
             &data($_);
        };
    }
    elsif ($command eq 'A') {
	@field = split(/ /, $_, 3);
	$field[0]=~tr/A-Z/a-z/;		# all my attributes are l.c. -- ht
        $field[2]=~tr/A-Z/a-z/ unless ($field[1] eq "CDATA");
	$attribute_type{$field[0]} = $field[1];
	&unescape_data($field[2]);
	$attribute_value{$field[0]} = $field[2];
    }
#    elsif ($command eq '&') {
#	&entity($_);
#    }
#    elsif ($command eq 'D') {
#	@field = split(/ /, $_, 4);
#	$data_attribute_type{$field[0], $field[1]} = $field[2];
#	&unescape_data($field[3]);
#	$data_attribute_value{$field[0], $field[1]} = $field[3];
#    }
#    elsif ($command eq 'N') {
#	$notation{$_} = 1;
#	if (defined($sysid)) {
#	    $notation_sysid{$_} = $sysid;
#	    undef($sysid);
#	}
#	if (defined($pubid)) {
#	    $notation_pubid{$_} = $pubid;
#	    undef($pubid);
#	}
#    }
#    elsif ($command eq 'I') {
#        @field = split(/ /, $_, 3);
#	$entity_type{$field[0]} = $field[1];
#	&unescape($field[2]);
#	# You may want to substitute \e for \ if the type is CDATA.
#	$entity_text{$field[0]} = $field[2];
#	$entity_code{$field[0]} = 'I';
#    }
#    elsif ($command eq 'E') {
#	@field = split(/ /, $_);
#	$entity_code{$field[0]} = 'E';
#	$entity_type{$field[0]} = $field[1];
#	$entity_notation{$field[0]} = $field[2];
#	if (defined(@files)) {
#	    foreach $i (0..$#files) {
#		$entity_filename{$field[0], $i} = $files[i];
#	    }
#	    undef(@files);
#	}
#	if (defined($sysid)) {
#	    $entity_sysid{$field[0]} = $sysid;
#	    undef($sysid);
#	}
#	if (defined($pubid)) {
#	    $entity_pubid{$field[0]} = $pubid;
#	    undef($pubid);
#	}
#    }
#    elsif ($command eq 'S') {
#	$entity_code{$_} = 'S';
#	if (defined(@files)) {
#	    foreach $i (0..$#files) {
#		$entity_filename{$_, $i} = $files[i];
#	    }
#	    undef(@files);
#	}
#	if (defined($sysid)) {
#	    $entity_sysid{$_} = $sysid;
#	    undef($sysid);
#	}
#	if (defined($pubid)) {
#	    $entity_pubid{$_} = $pubid;
#	    undef($pubid);
#	}
#    }
#    elsif ($command eq '?') {
#	&unescape($_);
#	&pi($_);
#    }
#    elsif ($command eq 'L') {
#	@field = split(/ /, $_);
#	$lineno = $field[0];
#	if ($#field >= 1) {
#	    &unescape($field[1]);
#	    $filename = $field[1];
#	}
#    }
#    elsif ($command eq 'V') {
#	@field = split(/ /, $_, 2);
#	&unescape($field[1]);
#	$environment{$field[0]} = $field[1];
#    }
#    elsif ($command eq '{') {
#	&start_subdoc($_);
#    }
#    elsif ($command eq '}') {
#	&end_subdoc($_);
#    }
#    elsif ($command eq 'f') {
#	&unescape($_);
#	push(@files, $_);
#    }
#    elsif ($command eq 'p') {
#	&unescape($_);
#	$pubid = $_;
#    }
#    elsif ($command eq 's') {
#	&unescape($_);
#	$sysid = $_;
#    }
    elsif ($command eq 'C') {
	$conforming = 1;
    }
    else {
	warn "$prog:$ARGV:$.: unrecognized command \`$command'\n";
    }
}
die "Not conforming" unless ($conforming);

sub unescape {
    $_[0] =~ s/\\([0-7][0-7]?[0-7]?|.)/&esc($1)/eg;
}

sub esc {
    local($_) = $_[0];
    if ($_ eq '012' || $_ eq '12') {
	"";			# ignore RS
    }
    elsif (/^[0-7]/) {
	sprintf("%c", oct);
    }
    elsif ($_ eq 'n') {
	"\n";
    }
    elsif ($_ eq '|') {
	"";
    }
    elsif ($_ eq "\\") {
	"\\";
    }
    else {
	$_;
    }
}

sub unescape_data {
    local($sdata) = 0;
    $_[0] =~ s/\\([0-7][0-7]?[0-7]?|.)/&esc_data($1)/eg;
    &doent;
    $_[0] =~ s/\n+/\n/g;
}

sub esc_data {
    local($_) = $_[0];
    if ($_ eq '012' || $_ eq '12') {
	"";			# ignore RS '
    }
    elsif (/^2[0-3][0-7]$/) {
	# disallowed range
	sprintf("&#%d;", oct);
    }
    elsif (/^[0-7]/) {
	sprintf("%c", oct);
    }
    elsif ($_ eq 'n') {
	"\n";
    }
    elsif ($_ eq '|') {
	$sdata = !$sdata;
	"";
    }
    elsif ($_ eq "\\") {
	$sdata ? "\\" : $backslash_in_data; # I don't under stand this -- ht
    }
    else {
	$_;
    }
}


sub start_element {
    local($gi) = $_[0];
    &pprint("<$gi");
    if (! (@order=split(";",$print_order{$gi}))) {
	# need to compute standard attribute order
	# do we have some constraints
	if (@tags_only) {
	    foreach $tkey (@tags_only) {
		push(@order,$tkey) if ($attribute_type{$tkey});
	    };
	}
	else {
	    if ($co=$att_order{$gi}) {
		push(@order,split(";",$co));
	    };
	    foreach $akey (sort keys %attribute_value) {
		push(@order,$akey) unless ($co =~ /(^|;)$akey(;|$)/);
				       };
	};
	$print_order{$gi}=join(";",@order);
    };
    foreach $skey (@order) {
	&print_av($skey,$attribute_value{$skey},
		  $attribute_type{$skey});
    };
    &pprint(">\n");
}

sub print_av {
    local($name,$value,$type)=@_;
#    print "\n$name:$value:$defaults{$name}:$type\n";
    if ($value ne "") {
	if ($defaults{$name} ne $value) {
	    &pprint(" $name=");
	    $qt='"';
	    if ((! ($clean=($value!~/[^-.a-z0-9A-Z]/))) &&
		($value=~/"/)) { # " )) {
		if ($value=~/'/) { # ' ){
		    $value=~s/'/&sq;/g; # can't manage both so replace one
		};
		$qt="'";
	    };
	    &pprint($qt) unless ($clean);
	    &pprint($value);
	    &pprint($qt) unless ($clean);
	};
    }
    else {
	die "$name has no value, but type is $type" unless ($type eq "IMPLIED");
    };
}

sub end_element {
    local($gi) = $_[0];
#    return if ($gi eq "hack");
    &pprint("</$gi>\n") unless ($gi=~/^$empty$/o);
}

sub data {
    &pprint($_);
    &pprint("\n");
}

# all printing comes through here, to enforce blank lines all and only
# as determined by @blank-line-between
sub pprint {
    local($lines)=@_;
    # first take care of initial \n, if any
    if ($lines =~ /^\n/) {
	$lines=$'; #'
	if (! $bol) {
	    print "\n";
	    $bol=1;
	};
    };
    return if ($lines eq "");
    # now multiple lines
    while ($lines =~ s/^(.*\n)(.)/$2/) {
	&pprint($1);
    };
    # initial (originally internal) \n out
    $lines =~ s/^\n+//;
    return if ($lines eq "");
    # now have single 'line', with possible \n at end
    if ($bol) {
#	print ">>$previous::$lines:::\n";
	$lines=~s/^[ \t]+// unless $keep_blanks;
	print "\n" if (&blb($previous,$lines));
	$previous=$lines;
    };
    $lines=~s/[ \t]+$// unless $keep_blanks;
    print $lines;
    $bol=($lines =~ /\n$/) unless ($lines eq "");
}

	

# A processing instruction.

sub pi {
    local($data) = $_[0];
    die "pi";
}

# A reference to an external entity.

sub entity {
    local($name) = $_[0];
    die "ext";
}

sub start_subdoc {
    local($name) = $_[0];
    die "bsd";
}

sub end_subdoc {
    local($name) = $_[0];
    die "esd";
}

