diff options
Diffstat (limited to 'lib/MediaLibParser.pm')
| -rw-r--r-- | lib/MediaLibParser.pm | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/lib/MediaLibParser.pm b/lib/MediaLibParser.pm new file mode 100644 index 0000000..f78ca6f --- /dev/null +++ b/lib/MediaLibParser.pm @@ -0,0 +1,310 @@ +package MediaLibParser;
+
+use strict;
+
+use LWP::UserAgent;
+use HTML::TreeBuilder;
+use Data::Dumper;
+
+our ( @ISA, @EXPORT, $VERSION );
+@ISA = qw(Exporter);
+
+use FindBin qw($RealBin);
+use lib sprintf("%s", $RealBin);
+use lib sprintf("%s/../lib", $RealBin);
+
+@EXPORT = qw(&result_search);
+$VERSION = "0.01";
+
+sub new {
+ my ( $type, %params ) = @_;
+ my $class = ref $type || $type;
+
+ my $self = {};
+ bless $self, $class;
+
+ return $self;
+}
+
+sub url_main { my ( $self, $new ) = @_; $self->{_url_main} = $new if defined $new; return $self->{_url_main}; }
+sub url_results { my ( $self, $new ) = @_; $self->{_url_results} = $new if defined $new; return $self->{_url_main}.$self->{_url_results}; }
+sub url_media { my ( $self, $new ) = @_; $self->{_url_media} = $new if defined $new; return $self->{_url_media}; }
+sub url_actor { my ( $self, $new ) = @_; $self->{_url_actor} = $new if defined $new; return $self->{_url_main}.$self->{_url_actor}; }
+sub method_result { my ( $self, $new ) = @_; $self->{_method_result} = $new if defined $new; return $self->{_method_result}; }
+sub method_media { my ( $self, $new ) = @_; $self->{_method_media} = $new if defined $new; return $self->{_method_media}; }
+sub method_actor { my ( $self, $new ) = @_; $self->{_method_actor} = $new if defined $new; return $self->{_method_actor}; }
+sub lookup_result { my ( $self, $new ) = @_; $self->{_lookup_result} = $new if defined $new; return $self->{_lookup_result}; }
+sub lookup_actor { my ( $self, $new ) = @_; $self->{_lookup_actor} = $new if defined $new; return $self->{_lookup_actor}; }
+sub param_result { my ( $self, $new ) = @_; $self->{_param_result} = $new if defined $new; return $self->{_param_result}; }
+sub param_actor { my ( $self, $new ) = @_; $self->{_param_actor} = $new if defined $new; return $self->{_param_actor}; }
+sub start_result { my ( $self, $new ) = @_; $self->{_start_result} = $new if defined $new; return $self->{_start_result}; }
+sub regex_actor { my ( $self, $new ) = @_; $self->{_regex_actor} = $new if defined $new; return $self->{_regex_actor}; }
+
+sub actor {
+ my $self = shift;
+
+ my $tree = $self->actor_fetch;
+ my $ret = $self->actor_search($tree, $self->actor_search_map);
+}
+
+sub result {
+ my $self = shift;
+
+ my $tree = $self->result_fetch;
+ my @ret = $self->result_search($tree, $self->result_search_map);
+ return @ret;
+}
+
+sub media {
+ my $self = shift;
+
+ my $tree = $self->media_fetch;
+ my $ret = $self->media_search($tree, $self->media_search_map);
+ return $ret;
+}
+
+sub actor_fetch {
+ my $self = shift;
+
+ return $self->_fetch( $self->method_actor, $self->url_actor, $self->param_actor );
+}
+
+sub media_fetch {
+ my $self = shift;
+
+ return $self->_fetch( $self->method_media, $self->url_media );
+}
+
+sub result_fetch {
+ my $self = shift;
+
+ return $self->_fetch( $self->method_result, $self->url_results, $self->param_result )
+}
+
+sub _fetch {
+ my $self = shift;
+ my $method = shift;
+ my $url = shift;
+ my $params = shift;
+
+ my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0' );
+ $ua->timeout(10);
+
+ my $response;
+ if ( defined $params ) {
+ $response = $ua->$method( $url, $params );
+ } else {
+ $response = $ua->$method( $url );
+ }
+ my $tree= HTML::TreeBuilder->new;
+ if ( $response->is_success ) {
+ $tree->parse($response->content);
+ }
+ #print $tree->as_HTML();
+ return $tree;
+}
+
+sub actor_search {
+ my $self = shift;
+ my $tree = shift;
+ my $as_map = shift;
+
+ my $url;
+
+ if ( defined $as_map->{'subpage'} ) {
+ my ( $address, $href);
+ if ( defined $as_map->{'subpage'}->{'rel_address'} &&
+ ref $tree->address($as_map->{'subpage'}->{'rel_address'}) ) {
+
+ $address = $tree->address($as_map->{'subpage'}->{'rel_address'});
+ } elsif ( defined $as_map->{'subpage'}->{'look_down'} ) {
+
+ if ( defined $as_map->{'subpage'}->{'after_look_down_method'} ) {
+ my $method = $as_map->{'subpage'}->{'after_look_down_method'};
+ $address = $tree->look_down(
+ sub { $_[0]->tag eq "a" && $_[0]->as_text =~ $as_map->{'subpage'}->{'look_down'} }
+ )->$method;
+ } else {
+ $address = $tree->look_down(
+ sub { $_[0]->tag eq "a" && $_[0]->as_text =~ $as_map->{'subpage'}->{'look_down'} }
+ );
+ }
+ }
+ if ( ref $address ) {
+ my $method = $as_map->{'subpage'}->{'method'};
+ my $method_params = $as_map->{'subpage'}->{'method_params'} || undef;
+ if ( $method_params ) {
+ $href = $address->$method($method_params);
+ } else {
+ $href = $address->$method;
+ }
+ if ( defined $href && defined $as_map->{'subpage'}->{'post_proc'} ) {
+ my $coderef = $as_map->{'subpage'}->{'post_proc'};
+ $href = &$coderef( $href );
+ }
+ }
+
+ if ( $href ) {
+ $self->url_media($href);
+ $tree = $tree->delete;
+ $tree = $self->media_fetch;
+ }
+ }
+ my $address;
+ if ( defined $as_map->{'imgurl'}->{'after_look_down_method'} ) {
+ my $method = $as_map->{'imgurl'}->{'after_look_down_method'};
+ $address = $tree->look_down(
+ sub { $_[0]->tag eq "img" && $_[0]->attr('alt') =~ $as_map->{'imgurl'}->{'look_down'} }
+ )->$method;
+ } else {
+ $address = $tree->look_down(
+ sub { $_[0]->tag eq "img" && $_[0]->attr('alt') =~ $as_map->{'imgurl'}->{'look_down'} }
+ );
+ }
+ if ( ref $address ) {
+ my $method = $as_map->{'imgurl'}->{'method'};
+ my $method_params = $as_map->{'imgurl'}->{'method_params'} || undef;
+ if ( $method_params ) {
+ $url = $address->$method($method_params);
+ } else {
+ $url = $address->$method;
+ }
+ if ( defined $url && defined $as_map->{'imgurl'}->{'post_proc'} ) {
+ my $coderef = $as_map->{'imgurl'}->{'post_proc'};
+ $url = &$coderef( $url );
+ }
+ }
+ return $url;
+}
+
+sub media_search {
+ my $self = shift;
+ my $tree = shift;
+ my $ms_map = shift;
+
+ my $ret = {};
+ if ( ref $tree->address($ms_map->{'table'}->{'abs_address'}) ) {
+ my $table = $tree->address($ms_map->{'table'}->{'abs_address'});
+
+ foreach my $key ( keys %$ms_map ) {
+ next if $key eq 'table';
+
+ my $address;
+ if ( defined $ms_map->{$key}->{'rel_address'} &&
+ ref $table->address($ms_map->{$key}->{'rel_address'}) ) {
+
+ $address = $table->address($ms_map->{$key}->{'rel_address'});
+ } elsif ( defined $ms_map->{$key}->{'look_down'} ) {
+
+ my $method = $ms_map->{$key}->{'after_look_down_method'};
+ my $look_address = $table->look_down(
+ sub { $_[0]->tag eq "td" && $_[0]->as_text =~ $ms_map->{$key}->{'look_down'} }
+ );
+ $address = $look_address->$method if ref $look_address
+ }
+ $ret->{$key} = $self->_media_search_field( $address, $ms_map->{$key} ) if ref $address;
+
+ if ( defined $ms_map->{$key}->{'add'} ) {
+ my $add;
+ foreach my $sub ( @{$ms_map->{$key}->{'add'}} ) {
+ my $sub_address;
+ if ( defined $sub->{'rel_address'} && ref $table->address( $sub->{'rel_address'}) ) {
+
+ $sub_address = $table->address( $sub->{'rel_address'});
+ } elsif ( defined $sub->{'look_down'} ) {
+
+ my $method = $sub->{'after_look_down_method'};
+ my $look_sub_address = $table->look_down(
+ sub { $_[0]->tag eq "td" && $_[0]->as_text =~ $sub->{'look_down'} }
+ );
+ $sub_address = $look_sub_address->$method if ref $look_sub_address;
+ }
+ push @$add, $self->_media_search_field( $sub_address, $sub ) if ref $sub_address;
+ }
+ if ( defined $ms_map->{$key}->{'add_proc'} ) {
+ my $coderef = $ms_map->{$key}->{'add_proc'};
+ $ret->{$key} = &$coderef( $ret->{$key}, @$add );
+ }
+ }
+ }
+ }
+ return $ret;
+}
+
+sub _media_search_field {
+ my $self = shift;
+ my $address = shift;
+ my $ms_sub_map = shift;
+ my $ret;
+
+ my $method = $ms_sub_map->{'method'};
+ my $method_params = $ms_sub_map->{'method_params'} || undef;
+ if ( $method_params ) {
+ $ret = $address->$method($method_params);
+ } else {
+ $ret = $address->$method;
+ }
+ if ( defined $ret && defined $ms_sub_map->{'post_proc'} ) {
+ my $coderef = $ms_sub_map->{'post_proc'};
+ $ret = &$coderef( $ret );
+ }
+ return $ret;
+}
+
+sub result_search {
+ my $self = shift;
+ my $tree = shift;
+ my $rs_map = shift;
+
+ my $ret = [];
+ my $resultcount = 0;
+ if ( ref $tree->address($rs_map->{'resultcount'}->{'abs_address'}) ) {
+ $resultcount = $tree->address($rs_map->{'resultcount'}->{'abs_address'})->as_text;
+ }
+ if ( ref $tree->address($rs_map->{'table'}->{'abs_address'}) ) {
+ my $table = $tree->address($rs_map->{'table'}->{'abs_address'});
+ while ( 1 ) {
+ my ($imgurl, $title, $titleurl, $desc);
+ my $entry = $table->address('.'.$rs_map->{'entry'}->{'search_from'}++);
+ if ( ref $entry ) {
+ if ( ref $entry->address($rs_map->{'imgurl'}->{'rel_address'}) ) {
+ $imgurl = $entry->address($rs_map->{'imgurl'}->{'rel_address'})->attr('src');
+ if ( $imgurl && defined $rs_map->{'imgurl'}->{'post_proc'} ) {
+ my $coderef = $rs_map->{'imgurl'}->{'post_proc'};
+ $imgurl = &$coderef( $imgurl );
+ }
+ }
+ if ( ref $entry->address($rs_map->{'title'}->{'rel_address'}) ) {
+ $title = $entry->address($rs_map->{'title'}->{'rel_address'})->as_text;
+ $titleurl = $entry->address($rs_map->{'title'}->{'rel_address'})->attr('href');
+ unless ( $title ) {
+ if ( ref $entry->address($rs_map->{'title'}->{'rel_alternate'}) ) {
+ $title = $entry->address($rs_map->{'title'}->{'rel_alternate'})->as_text;
+ }
+ }
+ if ( $title && defined $rs_map->{'title'}->{'post_proc'} ) {
+ my $coderef = $rs_map->{'title'}->{'post_proc'};
+ $title = &$coderef( $title );
+ }
+ if ( $titleurl && defined $rs_map->{'titleurl'}->{'post_proc'} ) {
+ my $coderef = $rs_map->{'titleurl'}->{'post_proc'};
+ $titleurl = &$coderef( $titleurl );
+ }
+ }
+ if ( ref $entry->address($rs_map->{'desc'}->{'rel_address'}) ) {
+ $desc = $entry->address($rs_map->{'desc'}->{'rel_address'})->clone;
+ if ( $desc && defined $rs_map->{'desc'}->{'post_proc'} ) {
+ my $coderef = $rs_map->{'desc'}->{'post_proc'};
+ $desc = &$coderef( $desc );
+ }
+ }
+ push @$ret, [ $titleurl, $imgurl, $title, $desc ];
+ } else {
+ last;
+ }
+ }
+ }
+ return ($ret, $resultcount);
+
+}
+1;
\ No newline at end of file |
