Html.pm


NAME

Communiware::Datatype::Html - implements HTML datatype.


SYNOPSIS

Implements datatype which is stored as html, but edited as text, rtf or html.


DESCRIPTION

HTML datatype is most common datatype in Communiware. It can be uploaded as plain text with special emphasis chars, rtf files (only via upload method) and as is. After that it is passed through syntax checker during upload.

img src and link href are preprocessed to find references to other communiware filters. USES argument is added to attrs.


FUNCIONS

save

Distinguishes between HTML and Communiware rich text input.

Calls procedures from Communiware::Format::<etc>

Uses options FORMAT for formatting text (see Communiware::Text docs) NO_LINT to normalize HTML instead of manual correction.

Format defaults to strict paragrapgs.

In case of HTML, calls the do_html manpage routine described below.

upload

Distinguishes between text,html and rtf.

If rtf, then converts it into html and calls the do_html manpage routine Otherwise, reads text in the scalar variable and calls save.

get


Adds correct HTML head. We must process representation validation outside of
this procedure

methods

No additional methods defined

options

return list of options allowed for this content type - FORMAT, NO_LINT Note, it returns names of options as passed in cgi param names.


INTERNAL PROCEDURES

do_html

Recieves item_id, text, list of attributes and flag, indicating that html validation step should be fixed.

Validates (if not forbidden) html, rewrites links to pictures and cmw: scheme urls and returns content of <BODY> tag.

set_errorInfo

syntax

set_errorInfo(filename,linenumber);

Should be called before get or upload methods. Sets file name which should be reported in HTML validator messages and line number which should be added to line number reported by validator.

This information would be cleared out upon completing validation

preprocess_rtf

parameters item_id and filehandle reference Return text in html format. Tries to extract pictures.

normalize_html This sub is an attempt to normalize bad html It takes html text and returns corrected (possibly the same author wanted) version of it. =cut

{ package MyParser; require HTML::Element; use base 'HTML::TreeBuilder';

  use constant DEBUG => 0;
  # Следующий фрагмент (функция text) взят "как есть" из текста модуля
  # HTML::TreeBuilder, за исключением того, что в нем закомментарен фрагмент,
  # помеченный, как *** for Communiware ***, который выполняет "лишнюю"
  # для наших целей раскодировку entities.
  my($indent, $nugget);
  sub text {
    return if $_[0]{'_stunted'};
    
  # Accept a "here's a text token" signal from HTML::Parser.
    my($self, $text, $is_cdata) = @_;
      # the >3.0 versions of Parser may pass a cdata node.
      # Thanks to Gisle Aas for pointing this out.
    
    return unless length $text; # I guess that's always right
    
    my $ignore_text = $self->{'_ignore_text'};
    my $no_space_compacting = $self->{'_no_space_compacting'};
    
    my $pos = $self->{'_pos'} || $self;
    
    # *** for Communiware ***
#    HTML::Entities::decode($text)
#     unless $ignore_text || $is_cdata
#      || $HTML::Tagset::isCDATA_Parent{$pos->{'_tag'}};
    # *** end for Communiware ***
    
    #my($indent, $nugget);
    if(DEBUG) {
      # optimization -- don't figure out depth unless we're in debug mode
      my @lineage_tags = $pos->lineage_tag_names;
      $indent = '  ' x (1 + @lineage_tags);
      
      $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...');
      $nugget =~ s<([\x00-\x1F])>
                 <'\\x'.(unpack("H2",$1))>eg;
      print
        $indent, "Proposing a new text node ($nugget) under ",
        join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root',
        ".\n";
      
    #} else {
    #  $indent = ' ';
    }
    
    
    my $ptag;
    if ($HTML::Tagset::isCDATA_Parent{$ptag = $pos->{'_tag'}}
        #or $pos->is_inside('pre')
        or $pos->is_inside('pre', 'textarea')
    ) {
        return if $ignore_text;
        $pos->push_content($text);
    } else {
        # return unless $text =~ /\S/;  # This is sometimes wrong
        
        if (!$self->{'_implicit_tags'} || $text !~ /\S/) {
            # don't change anything
        } elsif ($ptag eq 'head' or $ptag eq 'noframes') {
            if($self->{'_implicit_body_p_tag'}) {
              print $indent,
                " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
               if DEBUG > 1;
              $self->end(\$ptag);
              $pos =
                $self->{'_body'}
                ? ($self->{'_pos'} = $self->{'_body'}) # expected case
                : $self->insert_element('body', 1);
              $pos = $self->insert_element('p', 1);
            } else {
              print $indent,
                " * Text node under \U$ptag\E closes, implicates BODY.\n"
               if DEBUG > 1;
              $self->end(\$ptag);
              $pos =
                $self->{'_body'}
                ? ($self->{'_pos'} = $self->{'_body'}) # expected case
                : $self->insert_element('body', 1);
            }
        } elsif ($ptag eq 'html') {
            if($self->{'_implicit_body_p_tag'}) {
              print $indent,
                " * Text node under HTML implicates BODY and P.\n"
               if DEBUG > 1;
              $pos =
                $self->{'_body'}
                ? ($self->{'_pos'} = $self->{'_body'}) # expected case
                : $self->insert_element('body', 1);
              $pos = $self->insert_element('p', 1);
            } else {
              print $indent,
                " * Text node under HTML implicates BODY.\n"
               if DEBUG > 1;
              $pos =
                $self->{'_body'}
                ? ($self->{'_pos'} = $self->{'_body'}) # expected case
                : $self->insert_element('body', 1);
              #print "POS is $pos, ", $pos->{'_tag'}, "\n";
            }
        } elsif ($ptag eq 'body') {
            if($self->{'_implicit_body_p_tag'}) {
              print $indent,
                " * Text node under BODY implicates P.\n"
               if DEBUG > 1;
              $pos = $self->insert_element('p', 1);
            }
        } elsif ($ptag eq 'table') {
            print $indent,
              " * Text node under TABLE implicates TR and TD.\n"
             if DEBUG > 1;
            $self->insert_element('tr', 1);
            $pos = $self->insert_element('td', 1);
             # double whammy!
        } elsif ($ptag eq 'tr') {
            print $indent,
              " * Text node under TR implicates TD.\n"
             if DEBUG > 1;
            $pos = $self->insert_element('td', 1);
        }
        # elsif (
        #       # $ptag eq 'li'   ||
        #       # $ptag eq 'dd'   ||
        #         $ptag eq 'form') {
        #    $pos = $self->insert_element('p', 1);
        #}
        
        
        # Whatever we've done above should have had the side
        # effect of updating $self->{'_pos'}
        
                
        #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
        
        return if $ignore_text;
        $text =~ s/\s+/ /g unless $no_space_compacting ;  # canonical space
        
        print
          $indent, " (Attaching text node ($nugget) under ",
          # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
          $pos->{'_tag'},
          ").\n"
         if DEBUG > 1;
        
        $pos->push_content($text);
    }
    
    &{ $self->{'_tweak_~text'} || return }($text, $pos, $pos->{'_tag'} . '');
     # Note that this is very exceptional -- it doesn't fall back to
     #  _tweak_*, and it gives its tweak different arguments.
    return;
  }
}

sub normalize_html {
local $/;
undef $/;
my $html = shift;
        my $tree = MyParser->new;
        $tree->p_strict(1);
        $tree->store_comments(1);
        $tree->no_space_compacting(1);
        $tree->ignore_ignorable_whitespace(0);
        #    $tree->implicit_body_p_tag(1);
        $tree->parse($html);
        $tree->eof();    #Признак прекращения потокового ввода
        my @tag_list;
        foreach my $forb (@Communiware::Datatype::Html::forbidden_tag) {
                @tag_list = $tree->find_by_tag_name($forb);
                foreach my $cur_tag (@tag_list) {
                        $cur_tag->delete;
                }
        }
        $html = $tree->as_HTML("<>\240", undef, {});
        $tree->delete();
        my $replacement = $Communiware::Charset::replace_string;
        $replacement =~ s/[Ёё]//g;
        $html =~ s/([$replacement])/$Communiware::Charset::escaping{$1}/g;
        return $html;
}

1;

16 октябрь 2007 13:44