From bcbf441e09fb502cf64924ff2530fa144bdf52c5 Mon Sep 17 00:00:00 2001 From: Andreas Brachold Date: Mon, 13 Aug 2007 18:41:27 +0000 Subject: * Move files to trunk --- lib/MediaLibParser.pm | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 lib/MediaLibParser.pm (limited to 'lib/MediaLibParser.pm') diff --git a/lib/MediaLibParser.pm b/lib/MediaLibParser.pm new file mode 100644 index 0000000..f78ca6f --- /dev/null +++ b/lib/MediaLibParser.pm @@ -0,0 +1,310 @@ +package MediaLibParser; + +use strict; + +use LWP::UserAgent; +use HTML::TreeBuilder; +use Data::Dumper; + +our ( @ISA, @EXPORT, $VERSION ); +@ISA = qw(Exporter); + +use FindBin qw($RealBin); +use lib sprintf("%s", $RealBin); +use lib sprintf("%s/../lib", $RealBin); + +@EXPORT = qw(&result_search); +$VERSION = "0.01"; + +sub new { + my ( $type, %params ) = @_; + my $class = ref $type || $type; + + my $self = {}; + bless $self, $class; + + return $self; +} + +sub url_main { my ( $self, $new ) = @_; $self->{_url_main} = $new if defined $new; return $self->{_url_main}; } +sub url_results { my ( $self, $new ) = @_; $self->{_url_results} = $new if defined $new; return $self->{_url_main}.$self->{_url_results}; } +sub url_media { my ( $self, $new ) = @_; $self->{_url_media} = $new if defined $new; return $self->{_url_media}; } +sub url_actor { my ( $self, $new ) = @_; $self->{_url_actor} = $new if defined $new; return $self->{_url_main}.$self->{_url_actor}; } +sub method_result { my ( $self, $new ) = @_; $self->{_method_result} = $new if defined $new; return $self->{_method_result}; } +sub method_media { my ( $self, $new ) = @_; $self->{_method_media} = $new if defined $new; return $self->{_method_media}; } +sub method_actor { my ( $self, $new ) = @_; $self->{_method_actor} = $new if defined $new; return $self->{_method_actor}; } +sub lookup_result { my ( $self, $new ) = @_; $self->{_lookup_result} = $new if defined $new; return $self->{_lookup_result}; } +sub lookup_actor { my ( $self, $new ) = @_; $self->{_lookup_actor} = $new if defined $new; return $self->{_lookup_actor}; } +sub param_result { my ( $self, $new ) = @_; $self->{_param_result} = $new if defined $new; return $self->{_param_result}; } +sub param_actor { my ( $self, $new ) = @_; $self->{_param_actor} = $new if defined $new; return $self->{_param_actor}; } +sub start_result { my ( $self, $new ) = @_; $self->{_start_result} = $new if defined $new; return $self->{_start_result}; } +sub regex_actor { my ( $self, $new ) = @_; $self->{_regex_actor} = $new if defined $new; return $self->{_regex_actor}; } + +sub actor { + my $self = shift; + + my $tree = $self->actor_fetch; + my $ret = $self->actor_search($tree, $self->actor_search_map); +} + +sub result { + my $self = shift; + + my $tree = $self->result_fetch; + my @ret = $self->result_search($tree, $self->result_search_map); + return @ret; +} + +sub media { + my $self = shift; + + my $tree = $self->media_fetch; + my $ret = $self->media_search($tree, $self->media_search_map); + return $ret; +} + +sub actor_fetch { + my $self = shift; + + return $self->_fetch( $self->method_actor, $self->url_actor, $self->param_actor ); +} + +sub media_fetch { + my $self = shift; + + return $self->_fetch( $self->method_media, $self->url_media ); +} + +sub result_fetch { + my $self = shift; + + return $self->_fetch( $self->method_result, $self->url_results, $self->param_result ) +} + +sub _fetch { + my $self = shift; + my $method = shift; + my $url = shift; + my $params = shift; + + my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0' ); + $ua->timeout(10); + + my $response; + if ( defined $params ) { + $response = $ua->$method( $url, $params ); + } else { + $response = $ua->$method( $url ); + } + my $tree= HTML::TreeBuilder->new; + if ( $response->is_success ) { + $tree->parse($response->content); + } + #print $tree->as_HTML(); + return $tree; +} + +sub actor_search { + my $self = shift; + my $tree = shift; + my $as_map = shift; + + my $url; + + if ( defined $as_map->{'subpage'} ) { + my ( $address, $href); + if ( defined $as_map->{'subpage'}->{'rel_address'} && + ref $tree->address($as_map->{'subpage'}->{'rel_address'}) ) { + + $address = $tree->address($as_map->{'subpage'}->{'rel_address'}); + } elsif ( defined $as_map->{'subpage'}->{'look_down'} ) { + + if ( defined $as_map->{'subpage'}->{'after_look_down_method'} ) { + my $method = $as_map->{'subpage'}->{'after_look_down_method'}; + $address = $tree->look_down( + sub { $_[0]->tag eq "a" && $_[0]->as_text =~ $as_map->{'subpage'}->{'look_down'} } + )->$method; + } else { + $address = $tree->look_down( + sub { $_[0]->tag eq "a" && $_[0]->as_text =~ $as_map->{'subpage'}->{'look_down'} } + ); + } + } + if ( ref $address ) { + my $method = $as_map->{'subpage'}->{'method'}; + my $method_params = $as_map->{'subpage'}->{'method_params'} || undef; + if ( $method_params ) { + $href = $address->$method($method_params); + } else { + $href = $address->$method; + } + if ( defined $href && defined $as_map->{'subpage'}->{'post_proc'} ) { + my $coderef = $as_map->{'subpage'}->{'post_proc'}; + $href = &$coderef( $href ); + } + } + + if ( $href ) { + $self->url_media($href); + $tree = $tree->delete; + $tree = $self->media_fetch; + } + } + my $address; + if ( defined $as_map->{'imgurl'}->{'after_look_down_method'} ) { + my $method = $as_map->{'imgurl'}->{'after_look_down_method'}; + $address = $tree->look_down( + sub { $_[0]->tag eq "img" && $_[0]->attr('alt') =~ $as_map->{'imgurl'}->{'look_down'} } + )->$method; + } else { + $address = $tree->look_down( + sub { $_[0]->tag eq "img" && $_[0]->attr('alt') =~ $as_map->{'imgurl'}->{'look_down'} } + ); + } + if ( ref $address ) { + my $method = $as_map->{'imgurl'}->{'method'}; + my $method_params = $as_map->{'imgurl'}->{'method_params'} || undef; + if ( $method_params ) { + $url = $address->$method($method_params); + } else { + $url = $address->$method; + } + if ( defined $url && defined $as_map->{'imgurl'}->{'post_proc'} ) { + my $coderef = $as_map->{'imgurl'}->{'post_proc'}; + $url = &$coderef( $url ); + } + } + return $url; +} + +sub media_search { + my $self = shift; + my $tree = shift; + my $ms_map = shift; + + my $ret = {}; + if ( ref $tree->address($ms_map->{'table'}->{'abs_address'}) ) { + my $table = $tree->address($ms_map->{'table'}->{'abs_address'}); + + foreach my $key ( keys %$ms_map ) { + next if $key eq 'table'; + + my $address; + if ( defined $ms_map->{$key}->{'rel_address'} && + ref $table->address($ms_map->{$key}->{'rel_address'}) ) { + + $address = $table->address($ms_map->{$key}->{'rel_address'}); + } elsif ( defined $ms_map->{$key}->{'look_down'} ) { + + my $method = $ms_map->{$key}->{'after_look_down_method'}; + my $look_address = $table->look_down( + sub { $_[0]->tag eq "td" && $_[0]->as_text =~ $ms_map->{$key}->{'look_down'} } + ); + $address = $look_address->$method if ref $look_address + } + $ret->{$key} = $self->_media_search_field( $address, $ms_map->{$key} ) if ref $address; + + if ( defined $ms_map->{$key}->{'add'} ) { + my $add; + foreach my $sub ( @{$ms_map->{$key}->{'add'}} ) { + my $sub_address; + if ( defined $sub->{'rel_address'} && ref $table->address( $sub->{'rel_address'}) ) { + + $sub_address = $table->address( $sub->{'rel_address'}); + } elsif ( defined $sub->{'look_down'} ) { + + my $method = $sub->{'after_look_down_method'}; + my $look_sub_address = $table->look_down( + sub { $_[0]->tag eq "td" && $_[0]->as_text =~ $sub->{'look_down'} } + ); + $sub_address = $look_sub_address->$method if ref $look_sub_address; + } + push @$add, $self->_media_search_field( $sub_address, $sub ) if ref $sub_address; + } + if ( defined $ms_map->{$key}->{'add_proc'} ) { + my $coderef = $ms_map->{$key}->{'add_proc'}; + $ret->{$key} = &$coderef( $ret->{$key}, @$add ); + } + } + } + } + return $ret; +} + +sub _media_search_field { + my $self = shift; + my $address = shift; + my $ms_sub_map = shift; + my $ret; + + my $method = $ms_sub_map->{'method'}; + my $method_params = $ms_sub_map->{'method_params'} || undef; + if ( $method_params ) { + $ret = $address->$method($method_params); + } else { + $ret = $address->$method; + } + if ( defined $ret && defined $ms_sub_map->{'post_proc'} ) { + my $coderef = $ms_sub_map->{'post_proc'}; + $ret = &$coderef( $ret ); + } + return $ret; +} + +sub result_search { + my $self = shift; + my $tree = shift; + my $rs_map = shift; + + my $ret = []; + my $resultcount = 0; + if ( ref $tree->address($rs_map->{'resultcount'}->{'abs_address'}) ) { + $resultcount = $tree->address($rs_map->{'resultcount'}->{'abs_address'})->as_text; + } + if ( ref $tree->address($rs_map->{'table'}->{'abs_address'}) ) { + my $table = $tree->address($rs_map->{'table'}->{'abs_address'}); + while ( 1 ) { + my ($imgurl, $title, $titleurl, $desc); + my $entry = $table->address('.'.$rs_map->{'entry'}->{'search_from'}++); + if ( ref $entry ) { + if ( ref $entry->address($rs_map->{'imgurl'}->{'rel_address'}) ) { + $imgurl = $entry->address($rs_map->{'imgurl'}->{'rel_address'})->attr('src'); + if ( $imgurl && defined $rs_map->{'imgurl'}->{'post_proc'} ) { + my $coderef = $rs_map->{'imgurl'}->{'post_proc'}; + $imgurl = &$coderef( $imgurl ); + } + } + if ( ref $entry->address($rs_map->{'title'}->{'rel_address'}) ) { + $title = $entry->address($rs_map->{'title'}->{'rel_address'})->as_text; + $titleurl = $entry->address($rs_map->{'title'}->{'rel_address'})->attr('href'); + unless ( $title ) { + if ( ref $entry->address($rs_map->{'title'}->{'rel_alternate'}) ) { + $title = $entry->address($rs_map->{'title'}->{'rel_alternate'})->as_text; + } + } + if ( $title && defined $rs_map->{'title'}->{'post_proc'} ) { + my $coderef = $rs_map->{'title'}->{'post_proc'}; + $title = &$coderef( $title ); + } + if ( $titleurl && defined $rs_map->{'titleurl'}->{'post_proc'} ) { + my $coderef = $rs_map->{'titleurl'}->{'post_proc'}; + $titleurl = &$coderef( $titleurl ); + } + } + if ( ref $entry->address($rs_map->{'desc'}->{'rel_address'}) ) { + $desc = $entry->address($rs_map->{'desc'}->{'rel_address'})->clone; + if ( $desc && defined $rs_map->{'desc'}->{'post_proc'} ) { + my $coderef = $rs_map->{'desc'}->{'post_proc'}; + $desc = &$coderef( $desc ); + } + } + push @$ret, [ $titleurl, $imgurl, $title, $desc ]; + } else { + last; + } + } + } + return ($ret, $resultcount); + +} +1; \ No newline at end of file -- cgit v1.2.3