GSMArena crawler

Iš Žinynas.
Jump to navigation Jump to search

Scriptas nėra užbaigtas, tai tiesiog PoC, kaip viskas turi veikti.


ĮSPĖJIMAS
Šis puslapis pažymėtas kaip nebaigtas. Jo turinyje gali trūkti informacijos apie aprašytus informacijos panaudojimus, taip pat gali būti esminių trūkumų, kurie gali trukdyti juos realizuoti.
FIXME
#!/usr/bin/perl
use LWP::UserAgent;
use HTML::TreeBuilder;
use Data::Printer;
use JSON::XS;

# Apply the :utf8 output layer to STDOUT.
binmode STDOUT, ":utf8";

# Crawl results, filled in by get_brands, get_models and get_model:
#   $data{<brand index>}{name|logo|link}
#   $data{<brand index>}{models}{<model index>}{link|img|title|name}
#   $data{<brand index>}{models}{<model index>}{specs}{<th text>}{<key>} = <value>
my %data = ();

# Perform an HTTP GET on $url and return the response object produced by
# LWP::UserAgent->get. Callers check ->is_success themselves.
#
# Arguments:
#   $url - absolute URL to fetch.
#
# The request uses a 10 second timeout. The package variable $USER_AGENT is
# an optional override for the User-Agent header; nothing in this file
# assigns it, so LWP's own default agent string is used unless it is set
# elsewhere.
sub getURL {
    my ($url) = @_;

    # Declared with "our" so the name refers to a known package variable.
    # The original read an undeclared global and passed undef to agent()
    # on every request.
    our $USER_AGENT;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent($USER_AGENT) if defined $USER_AGENT && length $USER_AGENT;

    return $ua->get($url);
}

# Read the GSMArena makers page and record each brand in %data.
#
# Every <img> with height 22, width 92 and border 0 is treated as a brand
# logo and stored as
#   $data{$index}{name} - the image's alt attribute
#   $data{$index}{logo} - the image's src attribute
#   $data{$index}{link} - the href attribute of the image's parent element
# where $index counts up from 0 in the order look_down returns the images.
#
# Arguments:
#   $limit - optional maximum number of brands to record. Defaults to 2,
#            the debugging cap this proof of concept has always applied;
#            pass 0 to record every brand.
#
# Returns nothing. Does nothing if the page cannot be fetched.
sub get_brands {
  my ($limit) = @_;
  $limit = 2 unless defined $limit;

  my $response = getURL("http://www.gsmarena.com/makers.php3");
  return unless $response->is_success;

  my $html = HTML::TreeBuilder->new_from_content($response->decoded_content);
  my @logos = $html->look_down(_tag => 'img', height => '22', width => '92', border => '0');

  my $index = 0;
  foreach my $img (@logos) {
    my $anchor = $img->parent;

    $data{$index}{name} = $img->attr('alt');
    $data{$index}{logo} = $img->attr('src');
    $data{$index}{link} = $anchor ? $anchor->attr('href') : undef;
    $index++;

    # Numeric comparison; the original compared the counter with string "eq".
    last if $limit > 0 && $index >= $limit;
  }

  # Release the parse tree explicitly once the attributes have been copied.
  $html->delete;
  return;
}

# Index of the next model to be stored for the brand currently being
# crawled. get_models reads and increments it; get_pages resets it.
my $mm = 0;

# For every brand in %data, fetch the brand's listing page, collect the
# pagination links found inside <div class="nav-pages">, and call get_models
# on the first page and then on each linked page.
#
# Takes no arguments and returns nothing. Brands whose listing page cannot
# be fetched are skipped.
sub get_pages {
  foreach my $i (keys %data) {
    # Model numbering restarts once per brand. The original reset the
    # counter after every pagination page, so models from later pages
    # overwrote earlier ones, and never reset it for brands without
    # pagination.
    $mm = 0;

    my $response = getURL("http://www.gsmarena.com/$data{$i}{link}");
    next unless $response->is_success;

    my $html = HTML::TreeBuilder->new_from_content($response->decoded_content);
    my $nav  = $html->look_down(_tag => 'div', class => 'nav-pages');

    # A page without a nav-pages div simply has no further pages.
    my @hrefs = $nav ? map { $_->attr('href') } $nav->look_down(_tag => 'a') : ();

    # The hrefs are plain strings now, so the tree can be released before
    # the (much longer) model crawl starts.
    $html->delete;

    get_models($data{$i}{link}, $i);
    foreach my $href (@hrefs) {
      next unless defined $href && length $href;
      get_models($href, $i);
    }
  }
  return;
}

# Fetch one brand listing page and record every model found on it.
#
# Arguments:
#   $URL - page path relative to http://www.gsmarena.com/
#   $ID  - key of the brand in %data that the models belong to
#
# Each <li> inside <div class="makers"> is stored under
# $data{$ID}{models}{$mm} with the keys link (anchor href), img (image src),
# title (image title) and name (text of the <span>). get_model is then
# called for that model and the file-level counter $mm is incremented.
#
# Returns nothing. Does nothing if the page cannot be fetched or has no
# "makers" div.
sub get_models {
  my ($URL, $ID) = @_;

  my $response = getURL("http://www.gsmarena.com/$URL");
  return unless $response->is_success;

  my $html   = HTML::TreeBuilder->new_from_content($response->decoded_content);
  my $makers = $html->look_down(_tag => 'div', class => 'makers');
  my @items  = $makers ? $makers->look_down(_tag => 'li') : ();

  foreach my $item (@items) {
    my $link = $item->look_down(_tag => 'a');
    my $img  = $item->look_down(_tag => 'img');
    my $name = $item->look_down(_tag => 'span');

    # Without a link there is no model page to crawl, so skip the entry
    # rather than requesting an empty URL.
    my $href = $link ? $link->attr('href') : undef;
    next unless defined $href && length $href;

    # Keys are assigned one by one so that any data already stored under
    # this model index is left in place.
    $data{$ID}{models}{$mm}{link}  = $href;
    $data{$ID}{models}{$mm}{img}   = $img  ? $img->attr('src')   : undef;
    $data{$ID}{models}{$mm}{title} = $img  ? $img->attr('title') : undef;
    $data{$ID}{models}{$mm}{name}  = $name ? $name->as_text      : undef;

    get_model($href, $ID, $mm);
    $mm++;
  }

  # Release the parse tree explicitly.
  $html->delete;
  return;
}

# Fetch one model page and record its specification tables.
# Example page: http://www.gsmarena.com/alcatel_pop_c9-5938.php
#
# Arguments:
#   $URL - page path relative to http://www.gsmarena.com/
#   $M   - brand key in %data
#   $A   - model key under $data{$M}{models}
#
# Every <table cellspacing="0"> on the page that contains a <th> is treated
# as one spec section named after the text of its first <th>. For each row
# of such a table, the text of <td class="ttl"> becomes the key and the
# text of <td class="nfo"> the value:
#   $data{$M}{models}{$A}{specs}{<section>}{<ttl text>} = <nfo text>
# A missing cell is recorded as "N/A"; rows with neither cell are skipped.
#
# Returns nothing. Does nothing if the page cannot be fetched.
sub get_model {
  my ($URL, $M, $A) = @_;

  my $response = getURL("http://www.gsmarena.com/$URL");
  return unless $response->is_success;

  my $html = HTML::TreeBuilder->new_from_content($response->decoded_content);

  foreach my $table ($html->look_down(_tag => 'table', cellspacing => '0')) {
    # Tables without a header cell cannot be given a section name.
    my $th = $table->look_down(_tag => 'th');
    next unless $th;
    my $section = $th->as_text;

    foreach my $row ($table->look_down(_tag => 'tr')) {
      # Look the cells up per row so that each title is paired with its own
      # value. The original searched the whole table on every row and
      # searched for both classes inside a single <td>, so a title and its
      # value were never stored together.
      my $ttl = $row->look_down(_tag => 'td', class => 'ttl');
      my $nfo = $row->look_down(_tag => 'td', class => 'nfo');
      next unless $ttl || $nfo;

      my $title = $ttl ? $ttl->as_text : "N/A";
      my $info  = $nfo ? $nfo->as_text : "N/A";

      $data{$M}{models}{$A}{specs}{$section}{$title} = "$info";
    }
  }

  # Release the parse tree explicitly.
  $html->delete;
  return;
}

# The full crawl (brands, then listing pages and their models) is currently
# commented out.
#get_brands();
#get_pages();


# Fetch a single model page and store its specs under
# $data{0}{models}{0}{specs}. Nothing is printed afterwards.
get_model("samsung_galaxy_on7-7679.php", 0, 0);