1: #!/usr/local/bin/perl
  2: # Copyright (c) 1998 Robert Braddock. All rights reserved.
  3: # This program is free software; you can redistribute it and/or
  4: # modify it under the same terms as Perl itself.  

  6: package SGML::ElementMap::Driver::XMLParser;
  7: use strict;

  9: use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION $REVISION);  BEGIN {
 10:     @ISA =       qw( SGML::ElementMap::Driver::EventQueue );
 11:     @EXPORT =      ();
 12:     @EXPORT_OK =   ();
 13:     %EXPORT_TAGS = ();
 14:     $VERSION =     0.2;
 15:     $REVISION =    q$Revision: 1.2 $;
 16: }
 17: use SGML::ElementMap::Driver::EventQueue ();
 18: use Exporter;


 21: BEGIN {
 22: SGML::ElementMap::Driver::set_obj_constants('IDX',&SGML::ElementMap::Driver::EventQueue::IDX_LAST(), qw(nb_interface source offset));
 23: sub IDX_parser () { return &SGML::ElementMap::Driver::EventQueue::IDX_parser(); }
 24: sub IDX_parse_queue () { return &SGML::ElementMap::Driver::EventQueue::IDX_parse_queue(); }
 25: sub IDX_input_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_input_mode(); }
 26: sub IDX_markup_mode () { return &SGML::ElementMap::Driver::EventQueue::IDX_markup_mode(); }
 27: }

 29: use Carp;
 30: use Symbol;
 31: use lib ('/usr/lib/perl5/XML/Parser');
 32: use XML::Parser;
 33: use XML::Parser::Expat;

 35: my $debug = 0;
 36: my $trace = 0;
 37: my $trace_hnd = 0;

 39: my $buf_size = 32;

 41: eval "use Data::Dumper;
 42: $Data::Dumper::Indent = $Data::Dumper::Indent = 0;
 43: $Data::Dumper::Terse  = $Data::Dumper::Terse  = 1;" if $debug;

 45: if (!caller()) { die "$0: perl module, not program\n"; }

 47: # create empty object
 48: # no arguments means default parser setup
 49: sub new {
 50:     my $proto = shift;
 51:     my $class = ref($proto) || $proto;

 53:     carp __PACKAGE__.' does not know what to do with arguments' if @_;
 54:     my $self = $class->SUPER::new();
 55:     $self->[&IDX_parser()] = '';
 56:     $self->[&IDX_nb_interface()] = '';
 57:     $self->[&IDX_source()] = '';
 58:     $self->[&IDX_offset()] = '';
 59:     
 60:     warn '--NEW from '.join(', line ',(caller())[1,2])."\n" if $trace;
 61:     
 62:     $self = bless $self,$class;
 63:     return $self;
 64: }

 66: sub DESTROY {
 67:     my $p = $_[0]->[&IDX_parser()];
 68:     $_[0]->[&IDX_parser()] = '';
 69:     warn '--DESTROY from '.join(', line ',(caller())[1,2])."\n" if $trace;
 70:     if (defined($p) && ref($p) && $p->isa('XML::Parser::Expat')) {
 71:         $p->release;
 72:     }
 73: }

 75: sub parser {
 76:     my ($self,$parser) = @_;
 77:     
 78:     warn '--PARSER from '.join(', line ',(caller())[1,2])."\n" if $trace;
 79:     my @handlers = ( 'Start' =>      $self->can( 'handle_start'       ),
 80:                      'End'    =>     $self->can( 'handle_end'         ),
 81:                      'Char'    =>    $self->can( 'handle_char'        ),
 82:                      'Proc'     =>   $self->can( 'handle_proc'        ),
 83:                      'Comment'   =>  $self->can( 'handle_comment'     ),
 84:                      'CdataStart' => $self->can( 'handle_cdata_start' ),
 85:                      'CdataEnd'  =>  $self->can( 'handle_cdata_end'   ),
 86:                      'Default'  =>   $self->can( 'handle_default'     ),
 87:                  );
 88:     #warn "HANDLERS".join('][',@handlers)."\n";
 89:     my @args = (
 90:                 #'ProtocolEncoding' => 'UTF-8',
 91:                 'Namespaces' => 1,
 92:                 'ParseParamEnt' => 1,
 93:                 );
 94:     
 95:     if (defined($parser) && ref($parser)) {
 96:         %{ $parser } = @args if @args;
 97:         $parser->setHandlers(@handlers);
 98:         #->{'Handlers'} = { 'Start' => \&handle_start,
 99:         #'End' => \&handle_end,
100:         #'not finished -- intentional error', };
101:     } elsif (defined($parser) && $parser) {
102:         if ($parser eq 'XML::Parser::ExpatNB') {
103:             $parser = $parser->new( #'Handlers'=>{@handlers},
104:                                    @args);
105:         } else {
106:             my ($s,$parser) = eval qq{ use $parser (); 
107:                                        (1, $parser\->new(\@args)); };
108:             warn $@ unless defined $s;
109:         }
110:         return undef unless $parser || '';
111:         $parser->setHandlers(@handlers);
112:     }
113:     return $self->SUPER::parser($parser);
114: }

116: #start new traversal (for use by users)
117: #passed main module as first (normal) argument, rest are process specific
118: sub process {
119:     my ($self,$main,$source,@list) = @_;
120:     
121:     warn '--PROCESS from '.join(', line ',(caller())[1,2])."\n" if $trace;
122:     if ($self->[&IDX_markup_mode()] eq 'sgml') {
123:         croak __PACKAGE__." can only parse XML documents";
124:     } elsif ($self->[&IDX_markup_mode()] eq 'xml') {
125:         if (! $self->[&IDX_parser()]) { # make default parser object
126:             croak "failed to instantiate parser" unless
127:                 defined $self->parser('XML::Parser::ExpatNB');
128:         }
129:     } else {
130:         croak "unknown markup mode '".$self->[&IDX_markup_mode()]."'\n";
131:     }

133:     my $input_type = $self->[&IDX_input_mode()];
134:     if (!$input_type || $input_type eq 'default') { # infer type
135:         if (ref($source) eq 'SCALAR') {
136:             $input_type = 'literal';
137:         } elsif (ref $source) {
138:             $input_type = 'handle';
139:         } else {
140:             $input_type = 'file';
141:         }
142:     }

144:     if ($input_type eq 'file' || $input_type eq 'path') {
145:         my $fh = new IO::File($source,'r');
146:         croak "failed to open $source: $!\n" unless defined $fh;
147:         $source = $fh;
148:         $input_type = 'handle';
149:     } # (not else)
150:     if ($input_type eq 'handle') {
151:         $self->[&IDX_source()] = $source;
152:         $self->[&IDX_offset()] = '';
153:     } elsif ($input_type eq 'literal' || $input_type eq 'string') {
154:         $self->[&IDX_offset()] = 0;
155:         $self->[&IDX_source()] = ref($source) ? $source : \$source;
156:     } else {
157:         die "unknown input type '".$input_type."'";
158:     }

160:     # ready the parser
161:     my $parser = $self->[&IDX_parser()];
162:     return undef unless
163:         $self->[&IDX_nb_interface()] = $parser; #->parse_start();

165:     $parser->{__PACKAGE__} = $self; # need access from handlers

167:     @list = $self->SUPER::process($main,'',@list);

169:     # break circular ref
170:     $parser->{__PACKAGE__} = '';# = $self;

172:     warn __PACKAGE__."return from process" if $debug;
173:     return @list;
174: }

176: # override superclass
177: sub process_sgml_file {
178:     my ($self,$main,$file,@list) = @_;
179:     croak "XML::Parser::Expat driver only handles XML documents";
180: }

182: sub markup {
183:     my ($self,$mode) = @_;
184:     if (defined($mode) && lc($mode) eq 'sgml') {
185:         return undef;
186:         #croak __PACKAGE__." can only parse XML documents";
187:     } else {
188:         return $self->SUPER::markup($mode);
189:     }   
190: }

192: sub trace {
193:     my ($arg) = @_;
194:     if (defined $arg) {
195:         $debug = $arg;
196:     } else {
197:         $debug = 1;
198:     }
199: }


202: # parse queue needs more stuff
203: sub queue_more_events {
204:     my ($self,$queue) = @_;
205:     my $buf = '';
206:     my ($offset) = \@{ $self }[&IDX_offset()];
207:     my ($parser, $source) = @{ $self }[&IDX_parser(), &IDX_source()];
208:     while (@$queue == 0) {
209:         croak "parse attempt after parsing already finished" unless $parser;
210:         # TODO 
211:         # can use $parser->skip_until(INDEX) when doing an 'until close' skip
212:         
213:       SCAN: {
214:           if ($$offset ne '') { # source is a string ref
215:               last SCAN unless length($$source) > $$offset;
216:               $buf = substr($$source, $$offset, $buf_size);
217:               $$offset += $buf_size;
218:           } else { # source is a IO::Handle
219:               die "lost source handle: $!" unless $source;
220:               last SCAN if $source->eof;
221:               my $s = $source->read($buf,$buf_size);
222:               die "TODO: some error occurred (maybe $!)" unless $s;
223:           }
224:       }
225:         if ($buf) {
226:             warn "PARSE[$$offset][$source][".$buf."]\n" if $trace_hnd;
227:             $parser->parse_more($buf);
228:             #ERROR CODE?
229:                  
230:         } else { 
231:             #warn 'CALL PARSE DONE' if $trace;
232:             #push @{ $self->{'parse queue'} }, ('end','','');
233:             push @{ $self->[&IDX_parse_queue()] }, ('','conforming','');
234:             $parser->parse_done(); #ERROR CODE?
235:             $self->[&IDX_parser()] = ''; # Expat kills itself here, so lose reference
236:             return 0;
237:         }
238:     }
239:     return 1;
240: }


243: # object, name, [name,val, ...]
244: sub handle_start {
245:     my ($parser,$element,@attrs) = @_;
246:     my  $spec_count = ($parser->specified_attr || 0) / 2; # total count
247:     warn "START:$element:[[$spec_count]]:".join(':',@attrs)."\n" if $trace_hnd;
248:     my $self = $parser->{__PACKAGE__};
249:     warn '--HANDLE_START from '.join(', line ',(caller())[1,2])."\n" if $trace;

251:     my @names = ();
252:     for(my $i=0; $i<@attrs; $i+=2) {
253:         #last unless $i < 2*$spec_count;
254:         push @names, $attrs[$i]; # attr name
255:     }
256:     my $obj = {'Type' => 'Element',
257:                'Name' => $element,
258:                'Attributes' => { @attrs },
259:                'AttributeOrder' => [ @names ],
260:                'Defaulted' => $spec_count };

262:     push @{$self->[&IDX_parse_queue()]}, ('start_element',
263:                                           $obj,
264:                                           '',);
265: }

267: # obj, name
268: sub handle_end {
269:     my ($parser, $element) = @_;
270:     warn "END:$element\n" if $trace_hnd;
271:     my $self = $parser->{__PACKAGE__};
272:     push @{$self->[&IDX_parse_queue()]},('end_element',
273:                                          scalar($self->nsname($element)),
274:                                          '');
275:     #$parser->original_string()];
276: }

278: #obj, utf8-string
279: sub handle_char {
280:     my ($parser,$text) = @_;
281:     warn "CHAR:$text\n" if $trace_hnd;
282:     my $self = $parser->{__PACKAGE__};
283:     #$text=$self->encode($text);
284:     my $queue = $self->[&IDX_parse_queue()];
285:     #join with previous event if it's text too
286:     #if (@$queue > 0 && $queue->[$#$queue-2] eq 'cdata') {
287:     #    $queue->[$#$queue-1] .= $text;
288:     #    #$self->{'parse queue'}[-1][-1].=$parser->original_string();
289:     #} else {
290:     push @$queue,('cdata',{'Data'=>($text || '')},'');
291:     #}
292: }


295: sub handle_proc {
296:     my ($parser,$target,$value) = @_;
297:     my $self = $parser->{__PACKAGE__};
298:     push @{$self->[&IDX_parse_queue()]},('pi',
299:                                     { 'Target' => ($target || ''), 
300:                                       'Data'   => ($value||'') },
301:                                     '');
302: }

304: sub handle_comment {
305:     my ($parser,$text) = @_;
306:     my $self=$parser->{__PACKAGE__};
307:     push @{$self->[&IDX_parse_queue()]},('comment', {'Data'=>($text||'')}, '');
308: }

310: sub handle_cdata_start {
311:     my ($parser) = @_;
312:     my $self=$parser->{__PACKAGE__};
313:     warn "ignoring cdata end\n" if $trace_hnd;
314:     #push @{$self->{output}},
315: }

317: sub handle_cdata_end {
318:     my ($parser) = @_;
319:     my $self = $parser->{__PACKAGE__};
320:     warn "ignoring cdata end\n" if $trace_hnd;
321:     #push @{$self->{output}},
322: }

324: # any chars that aren't fed to another handler (including markup chars)
325: sub handle_default {
326:     my ($parser,$string) = @_;
327:     my $self = $parser->{__PACKAGE__};
328:     warn "ignoring [".$string."]\n" if $trace_hnd;
329:     #push @{$self->{output}},
330: }




335: sub nsname {
336:     my ($self,$name) = @_;
337:     my $parser = $self->[&IDX_parser()];
338:     my $ns_uri = '';
339:     if ($parser->{'Namespaces'}) {
340:         $ns_uri = $parser->namespace($name) || '';
341:     }
342:     $name = $self->encode($name);
343:     return wantarray ? ($name, $ns_uri) : $name;
344: }

346: # all chars in UTF8 -- so?
347: sub encode {
348:     my ($self,$text)=@_;
349:     return $text;
350:     #if ($self->{latin}) {
351:     #    $text=~s{([\xc0-\xc3])(.)}{
352:     #        my $hi = ord($1);
353:     #        my $lo = ord($2);
354:     #        chr((($hi & 0x03) <<6) | ($lo & 0x3F))
355:     #        }ge;
356:     #}
357:     #$text;
358: }

360: 1;

362: __END__

364:             CdataStart   (Parser)
365:                This is called at the start of a CDATA section.

367:             CdataEnd     (Parser)
368:                This is called at the end of a CDATA section.

370:             Default      (Parser, String)
371:                This is called for any characters that don't have
372:                a registered handler.  This includes both charac­
373:                ters that are part of markup for which no events
374:                are generated (markup declarations) and characters
375:                that could generate events, but for which no han­
376:                dler has been registered.

378:                Whatever the encoding in the original document,
379:                the string is returned to the handler in UTF-8.

381:             Unparsed          (Parser, Entity, Base, Sysid,
382:            Pubid, Notation)
383:                This is called for a declaration of an unparsed
384:                entity. Entity is the name of the entity. Base is
385:                the base to be used for resolving a relative URI.
386:                Sysid is the system id. Pubid is the public id.
387:                Notation is the notation name. Base and Pubid may
388:                be undefined.

390:             Notation          (Parser, Notation, Base, Sysid,
391:            Pubid)
392:                This is called for a declaration of notation.
393:                Notation is the notation name.  Base is the base
394:                to be used for resolving a relative URI. Sysid is
395:                the system id. Pubid is the public id. Base,
396:                Sysid, and Pubid may all be undefined.


399:             ExternEnt         (Parser, Base, Sysid, Pubid)
400:                This is called when an external entity is refer­
401:                enced. Base is the base to be used for resolving a
402:                relative URI. Sysid is the system id. Pubid is the
403:                public id. Base, and Pubid may be undefined.

405:                This handler should either return a string, which
406:                represents the contents of the external entity, or
407:                return an open filehandle that can be read to
408:                obtain the contents of the external entity, or
409:                return undef, which indicates the external entity
410:                couldn't be found and will generate a parse error.

412:                If an open filehandle is returned, it must be
413:                returned as either a glob (*FOO) or as a reference
414:                to a glob (e.g. an instance of IO::Handle).

416:             ExternEntFin      (Parser)
417:                This is called after an external entity has been
418:                parsed. It allows applications to perform cleanup
419:                on actions performed in the above ExternEnt han­
420:                dler.

422:             Entity            (Parser, Name, Val, Sysid, Pubid,
423:            Ndata, IsParam)
424:                This is called when an entity is declared. For
425:                internal entities, the Val parameter will contain
426:                the value and the remaining three parameters will
427:                be undefined. For external entities, the Val
428:                parameter will be undefined, the Sysid parameter
429:                will have the system id, the Pubid parameter will
430:                have the public id if it was provided (it will be
431:                undefined otherwise), the Ndata parameter will
432:                contain the notation for unparsed entities. If
433:                this is a parameter entity declaration, then the
434:                IsParam parameter is true.

436:                Note that this handler and the Unparsed handler
437:                above overlap. If both are set, then this handler
438:                will not be called for unparsed entities.

440:             Element           (Parser, Name, Model)
441:                The element handler is called when an element dec­
442:                laration is found. Name is the element name, and
443:                Model is the content model as an XML::Parser::Con­
444:                tentModel object. See the section on
445:                "XML::Parser::ContentModel Methods" for methods
446:                available for this class.

448:             Attlist           (Parser, Elname, Attname, Type,
449:            Default, Fixed)
450:                This handler is called for each attribute in an
451:                ATTLIST declaration.  So an ATTLIST declaration
452:                that has multiple attributes will generate multi­
453:                ple calls to this handler. The Elname parameter is
454:                the name of the element with which the attribute
455:                is being associated. The Attname parameter is the
456:                name of the attribute. Type is the attribute type,
457:                given as a string. Default is the default value,
458:                which will either be "#REQUIRED", "#IMPLIED" or a
459:                quoted string (i.e. the returned string will begin
460:                and end with a quote character). If Fixed is true,
461:                then this is a fixed attribute.



465:             Doctype           (Parser, Name, Sysid, Pubid,
466:            Internal)
467:                This handler is called for DOCTYPE declarations.
468:                Name is the document type name. Sysid is the sys­
469:                tem id of the document type, if it was provided,
470:                otherwise it's undefined. Pubid is the public id
471:                of the document type, which will be undefined if
472:                no public id was given. Internal will be true or
473:                false, indicating whether or not the doctype dec­
474:                laration contains an internal subset.

476:             DoctypeFin        (Parser)
477:                This handler is called after parsing of the DOC­
478:                TYPE declaration has finished, including any
479:                internal or external DTD declarations.

481:             XMLDecl           (Parser, Version, Encoding, Stan­
482:            dalone)
483:                This handler is called for XML declarations. Ver­
484:                sion is a string containg the version. Encoding is
485:                either undefined or contains an encoding string.
486:                Standalone is either undefined, or true or false.
487:                Undefined indicates that no standalone parameter
488:                was given in the XML declaration. True or false
489:                indicates "yes" or "no" respectively.