1: #!/usr/local/bin/perl 2: # Copyright (c) 1998 Robert Braddock. All rights reserved. 3: # This program is free software; you can redistribute it and/or 4: # modify it under the same terms as Perl itself. 6: package SGML::ElementMap::Driver::XMLParser; 7: use strict; 9: use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION $REVISION); BEGIN { 10: @ISA = qw( SGML::ElementMap::Driver::EventQueue ); 11: @EXPORT = (); 12: @EXPORT_OK = (); 13: %EXPORT_TAGS = (); 14: $VERSION = 0.2; 15: $REVISION = q$Revision: 1.2 $; 16: } 17: use SGML::ElementMap::Driver::EventQueue (); 18: use Exporter; 21: BEGIN { 22: SGML::ElementMap::Driver::set_obj_constants('IDX',&SGML::ElementMap::Driver::EventQueue::IDX_LAST(), qw(nb_interface source offset)); 23: sub IDX_parser () { return &SGML::ElementMap::Driver::EventQueue::IDX_parser(); } 24: sub IDX_parse_queue () { return &SGML::ElementMap::Driver::EventQueue::IDX_parse_queue(); } 25: sub IDX_input_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_input_mode(); } 26: sub IDX_markup_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_markup_mode(); } 27: } 29: use Carp; 30: use Symbol; 31: use lib ('/usr/lib/perl5/XML/Parser'); 32: use XML::Parser; 33: use XML::Parser::Expat; 35: my $debug = 0; 36: my $trace = 0; 37: my $trace_hnd = 0; 39: my $buf_size = 32; 41: eval "use Data::Dumper; 42: $Data::Dumper::Indent = $Data::Dumper::Indent = 0; 43: $Data::Dumper::Terse = $Data::Dumper::Terse = 1;" if $debug; 45: if (!caller()) { die "$0: perl module, not program\n"; } 47: # create empty object 48: # no arguments means default parser setup 49: sub new { 50: my $proto = shift; 51: my $class = ref($proto) || $proto; 53: carp __PACKAGE__.' does not know what to do with arguments' if @_; 54: my $self = $class->SUPER::new(); 55: $self->[&IDX_parser()] = ''; 56: $self->[&IDX_nb_interface()] = ''; 57: $self->[&IDX_source()] = ''; 58: $self->[&IDX_offset()] = ''; 59: 60: warn '--NEW from '.join(', line ',(caller())[1,2])."\n" if $trace; 61: 62: $self = bless $self,$class; 63: return $self; 64: } 66: sub DESTROY { 67: my $p = $_[0]->[&IDX_parser()]; 68: $_[0]->[&IDX_parser()] = ''; 69: warn '--DESTROY from '.join(', line ',(caller())[1,2])."\n" if $trace; 70: if (defined($p) && ref($p) && $p->isa('XML::Parser::Expat')) { 71: $p->release; 72: } 73: } 75: sub parser { 76: my ($self,$parser) = @_; 77: 78: warn '--PARSER from '.join(', line ',(caller())[1,2])."\n" if $trace; 79: my @handlers = ( 'Start' => $self->can( 'handle_start' ), 80: 'End' => $self->can( 'handle_end' ), 81: 'Char' => $self->can( 'handle_char' ), 82: 'Proc' => $self->can( 'handle_proc' ), 83: 'Comment' => $self->can( 'handle_comment' ), 84: 'CdataStart' => $self->can( 'handle_cdata_start' ), 85: 'CdataEnd' => $self->can( 'handle_cdata_end' ), 86: 'Default' => $self->can( 'handle_default' ), 87: ); 88: #warn "HANDLERS".join('][',@handlers)."\n"; 89: my @args = ( 90: #'ProtocolEncoding' => 'UTF-8', 91: 'Namespaces' => 1, 92: 'ParseParamEnt' => 1, 93: ); 94: 95: if (defined($parser) && ref($parser)) { 96: %{ $parser } = @args if @args; 97: $parser->setHandlers(@handlers); 98: #->{'Handlers'} = { 'Start' => \&handle_start, 99: #'End' => \&handle_end, 100: #'not finished -- intentional error', }; 101: } elsif (defined($parser) && $parser) { 102: if ($parser eq 'XML::Parser::ExpatNB') { 103: $parser = $parser->new( #'Handlers'=>{@handlers}, 104: @args); 105: } else { 106: my ($s,$parser) = eval qq{ use $parser (); 107: (1, $parser\->new(\@args)); }; 108: warn $@ unless defined $s; 109: } 110: return undef unless $parser || ''; 111: $parser->setHandlers(@handlers); 112: } 113: return $self->SUPER::parser($parser); 114: } 116: #start new traversal (for use by users) 117: #passed main module as first (normal) argument, rest are process specific 118: sub process { 119: my ($self,$main,$source,@list) = @_; 120: 121: warn '--PROCESS from '.join(', line ',(caller())[1,2])."\n" if $trace; 122: if ($self->[&IDX_markup_mode()] eq 'sgml') { 123: croak __PACKAGE__." can only parse XML documents"; 124: } elsif ($self->[&IDX_markup_mode()] eq 'xml') { 125: if (! $self->[&IDX_parser()]) { # make default parser object 126: croak "failed to instantiate parser" unless 127: defined $self->parser('XML::Parser::ExpatNB'); 128: } 129: } else { 130: croak "unknown markup mode '".$self->[&IDX_markup_mode()]."'\n"; 131: } 133: my $input_type = $self->[&IDX_input_mode()]; 134: if (!$input_type || $input_type eq 'default') { # infer type 135: if (ref($source) eq 'SCALAR') { 136: $input_type = 'literal'; 137: } elsif (ref $source) { 138: $input_type = 'handle'; 139: } else { 140: $input_type = 'file'; 141: } 142: } 144: if ($input_type eq 'file' || $input_type eq 'path') { 145: my $fh = new IO::File($source,'r'); 146: croak "failed to open $source: $!\n" unless defined $fh; 147: $source = $fh; 148: $input_type = 'handle'; 149: } # (not else) 150: if ($input_type eq 'handle') { 151: $self->[&IDX_source()] = $source; 152: $self->[&IDX_offset()] = ''; 153: } elsif ($input_type eq 'literal' || $input_type eq 'string') { 154: $self->[&IDX_offset()] = 0; 155: $self->[&IDX_source()] = ref($source) ? $source : \$source; 156: } else { 157: die "unknown input type '".$input_type."'"; 158: } 160: # ready the parser 161: my $parser = $self->[&IDX_parser()]; 162: return undef unless 163: $self->[&IDX_nb_interface()] = $parser; #->parse_start(); 165: $parser->{__PACKAGE__} = $self; # need access from handlers 167: @list = $self->SUPER::process($main,'',@list); 169: # break circular ref 170: $parser->{__PACKAGE__} = '';# = $self; 172: warn __PACKAGE__."return from process" if $debug; 173: return @list; 174: } 176: # override superclass 177: sub process_sgml_file { 178: my ($self,$main,$file,@list) = @_; 179: croak "XML::Parser::Expat driver only handles XML documents"; 180: } 182: sub markup { 183: my ($self,$mode) = @_; 184: if (defined($mode) && lc($mode) eq 'sgml') { 185: return undef; 186: #croak __PACKAGE__." can only parse XML documents"; 187: } else { 188: return $self->SUPER::markup($mode); 189: } 190: } 192: sub trace { 193: my ($arg) = @_; 194: if (defined $arg) { 195: $debug = $arg; 196: } else { 197: $debug = 1; 198: } 199: } 202: # parse queue needs more stuff 203: sub queue_more_events { 204: my ($self,$queue) = @_; 205: my $buf = ''; 206: my ($offset) = \@{ $self }[&IDX_offset()]; 207: my ($parser, $source) = @{ $self }[&IDX_parser(), &IDX_source()]; 208: while (@$queue == 0) { 209: croak "parse attempt after parsing already finished" unless $parser; 210: # TODO 211: # can use $parser->skip_until(INDEX) when doing an 'until close' skip 212: 213: SCAN: { 214: if ($$offset ne '') { # source is a string ref 215: last SCAN unless length($$source) > $$offset; 216: $buf = substr($$source, $$offset, $buf_size); 217: $$offset += $buf_size; 218: } else { # source is a IO::Handle 219: die "lost source handle: $!" unless $source; 220: last SCAN if $source->eof; 221: my $s = $source->read($buf,$buf_size); 222: die "TODO: some error occurred (maybe $!)" unless $s; 223: } 224: } 225: if ($buf) { 226: warn "PARSE[$$offset][$source][".$buf."]\n" if $trace_hnd; 227: $parser->parse_more($buf); 228: #ERROR CODE? 229: 230: } else { 231: #warn 'CALL PARSE DONE' if $trace; 232: #push @{ $self->{'parse queue'} }, ('end','',''); 233: push @{ $self->[&IDX_parse_queue()] }, ('','conforming',''); 234: $parser->parse_done(); #ERROR CODE? 235: $self->[&IDX_parser()] = ''; # Expat kills itself here, so lose reference 236: return 0; 237: } 238: } 239: return 1; 240: } 243: # object, name, [name,val, ...] 244: sub handle_start { 245: my ($parser,$element,@attrs) = @_; 246: my $spec_count = ($parser->specified_attr || 0) / 2; # total count 247: warn "START:$element:[[$spec_count]]:".join(':',@attrs)."\n" if $trace_hnd; 248: my $self = $parser->{__PACKAGE__}; 249: warn '--HANDLE_START from '.join(', line ',(caller())[1,2])."\n" if $trace; 251: my @names = (); 252: for(my $i=0; $i<@attrs; $i+=2) { 253: #last unless $i < 2*$spec_count; 254: push @names, $attrs[$i]; # attr name 255: } 256: my $obj = {'Type' => 'Element', 257: 'Name' => $element, 258: 'Attributes' => { @attrs }, 259: 'AttributeOrder' => [ @names ], 260: 'Defaulted' => $spec_count }; 262: push @{$self->[&IDX_parse_queue()]}, ('start_element', 263: $obj, 264: '',); 265: } 267: # obj, name 268: sub handle_end { 269: my ($parser, $element) = @_; 270: warn "END:$element\n" if $trace_hnd; 271: my $self = $parser->{__PACKAGE__}; 272: push @{$self->[&IDX_parse_queue()]},('end_element', 273: scalar($self->nsname($element)), 274: ''); 275: #$parser->original_string()]; 276: } 278: #obj, utf8-string 279: sub handle_char { 280: my ($parser,$text) = @_; 281: warn "CHAR:$text\n" if $trace_hnd; 282: my $self = $parser->{__PACKAGE__}; 283: #$text=$self->encode($text); 284: my $queue = $self->[&IDX_parse_queue()]; 285: #join with previous event if it's text too 286: #if (@$queue > 0 && $queue->[$#$queue-2] eq 'cdata') { 287: # $queue->[$#$queue-1] .= $text; 288: # #$self->{'parse queue'}[-1][-1].=$parser->original_string(); 289: #} else { 290: push @$queue,('cdata',{'Data'=>($text || '')},''); 291: #} 292: } 295: sub handle_proc { 296: my ($parser,$target,$value) = @_; 297: my $self = $parser->{__PACKAGE__}; 298: push @{$self->[&IDX_parse_queue()]},('pi', 299: { 'Target' => ($target || ''), 300: 'Data' => ($value||'') }, 301: ''); 302: } 304: sub handle_comment { 305: my ($parser,$text) = @_; 306: my $self=$parser->{__PACKAGE__}; 307: push @{$self->[&IDX_parse_queue()]},('comment', {'Data'=>($text||'')}, ''); 308: } 310: sub handle_cdata_start { 311: my ($parser) = @_; 312: my $self=$parser->{__PACKAGE__}; 313: warn "ignoring cdata end\n" if $trace_hnd; 314: #push @{$self->{output}}, 315: } 317: sub handle_cdata_end { 318: my ($parser) = @_; 319: my $self = $parser->{__PACKAGE__}; 320: warn "ignoring cdata end\n" if $trace_hnd; 321: #push @{$self->{output}}, 322: } 324: # any chars that aren't fed to another handler (including markup chars) 325: sub handle_default { 326: my ($parser,$string) = @_; 327: my $self = $parser->{__PACKAGE__}; 328: warn "ignoring [".$string."]\n" if $trace_hnd; 329: #push @{$self->{output}}, 330: } 335: sub nsname { 336: my ($self,$name) = @_; 337: my $parser = $self->[&IDX_parser()]; 338: my $ns_uri = ''; 339: if ($parser->{'Namespaces'}) { 340: $ns_uri = $parser->namespace($name) || ''; 341: } 342: $name = $self->encode($name); 343: return wantarray ? ($name, $ns_uri) : $name; 344: } 346: # all chars in UTF8 -- so? 347: sub encode { 348: my ($self,$text)=@_; 349: return $text; 350: #if ($self->{latin}) { 351: # $text=~s{([\xc0-\xc3])(.)}{ 352: # my $hi = ord($1); 353: # my $lo = ord($2); 354: # chr((($hi & 0x03) <<6) | ($lo & 0x3F)) 355: # }ge; 356: #} 357: #$text; 358: } 360: 1; 362: __END__ 364: CdataStart (Parser) 365: This is called at the start of a CDATA section. 367: CdataEnd (Parser) 368: This is called at the end of a CDATA section. 370: Default (Parser, String) 371: This is called for any characters that don't have 372: a registered handler. This includes both charac 373: ters that are part of markup for which no events 374: are generated (markup declarations) and characters 375: that could generate events, but for which no han 376: dler has been registered. 378: Whatever the encoding in the original document, 379: the string is returned to the handler in UTF-8. 381: Unparsed (Parser, Entity, Base, Sysid, 382: Pubid, Notation) 383: This is called for a declaration of an unparsed 384: entity. Entity is the name of the entity. Base is 385: the base to be used for resolving a relative URI. 386: Sysid is the system id. Pubid is the public id. 387: Notation is the notation name. Base and Pubid may 388: be undefined. 390: Notation (Parser, Notation, Base, Sysid, 391: Pubid) 392: This is called for a declaration of notation. 393: Notation is the notation name. Base is the base 394: to be used for resolving a relative URI. Sysid is 395: the system id. Pubid is the public id. Base, 396: Sysid, and Pubid may all be undefined. 399: ExternEnt (Parser, Base, Sysid, Pubid) 400: This is called when an external entity is refer 401: enced. Base is the base to be used for resolving a 402: relative URI. Sysid is the system id. Pubid is the 403: public id. Base, and Pubid may be undefined. 405: This handler should either return a string, which 406: represents the contents of the external entity, or 407: return an open filehandle that can be read to 408: obtain the contents of the external entity, or 409: return undef, which indicates the external entity 410: couldn't be found and will generate a parse error. 412: If an open filehandle is returned, it must be 413: returned as either a glob (*FOO) or as a reference 414: to a glob (e.g. an instance of IO::Handle). 416: ExternEntFin (Parser) 417: This is called after an external entity has been 418: parsed. It allows applications to perform cleanup 419: on actions performed in the above ExternEnt han 420: dler. 422: Entity (Parser, Name, Val, Sysid, Pubid, 423: Ndata, IsParam) 424: This is called when an entity is declared. For 425: internal entities, the Val parameter will contain 426: the value and the remaining three parameters will 427: be undefined. For external entities, the Val 428: parameter will be undefined, the Sysid parameter 429: will have the system id, the Pubid parameter will 430: have the public id if it was provided (it will be 431: undefined otherwise), the Ndata parameter will 432: contain the notation for unparsed entities. If 433: this is a parameter entity declaration, then the 434: IsParam parameter is true. 436: Note that this handler and the Unparsed handler 437: above overlap. If both are set, then this handler 438: will not be called for unparsed entities. 440: Element (Parser, Name, Model) 441: The element handler is called when an element dec 442: laration is found. Name is the element name, and 443: Model is the content model as an XML::Parser::Con 444: tentModel object. See the section on 445: "XML::Parser::ContentModel Methods" for methods 446: available for this class. 448: Attlist (Parser, Elname, Attname, Type, 449: Default, Fixed) 450: This handler is called for each attribute in an 451: ATTLIST declaration. So an ATTLIST declaration 452: that has multiple attributes will generate multi 453: ple calls to this handler. The Elname parameter is 454: the name of the element with which the attribute 455: is being associated. The Attname parameter is the 456: name of the attribute. Type is the attribute type, 457: given as a string. Default is the default value, 458: which will either be "#REQUIRED", "#IMPLIED" or a 459: quoted string (i.e. the returned string will begin 460: and end with a quote character). If Fixed is true, 461: then this is a fixed attribute. 465: Doctype (Parser, Name, Sysid, Pubid, 466: Internal) 467: This handler is called for DOCTYPE declarations. 468: Name is the document type name. Sysid is the sys 469: tem id of the document type, if it was provided, 470: otherwise it's undefined. Pubid is the public id 471: of the document type, which will be undefined if 472: no public id was given. Internal will be true or 473: false, indicating whether or not the doctype dec 474: laration contains an internal subset. 476: DoctypeFin (Parser) 477: This handler is called after parsing of the DOC 478: TYPE declaration has finished, including any 479: internal or external DTD declarations. 481: XMLDecl (Parser, Version, Encoding, Stan 482: dalone) 483: This handler is called for XML declarations. Ver 484: sion is a string containg the version. Encoding is 485: either undefined or contains an encoding string. 486: Standalone is either undefined, or true or false. 487: Undefined indicates that no standalone parameter 488: was given in the XML declaration. True or false 489: indicates "yes" or "no" respectively.