GSMArena crawler
Jump to navigation
Jump to search
ĮSPĖJIMAS
Scriptas nėra užbaigtas, tai tik tiesiog PoC, kaip viskas turi veikti.
FIXME
|
#!/usr/bin/perl
use LWP::UserAgent;
use HTML::TreeBuilder;
use Data::Printer;
use JSON::XS;
# Send everything written to STDOUT through the :utf8 output layer.
binmode STDOUT, ":utf8";
# Crawl results shared by all subs below, keyed by a numeric brand index:
#   $data{$brand}{name|logo|link}
#   $data{$brand}{models}{$model}{link|img|title|name}
#   $data{$brand}{models}{$model}{specs}{$section}{$title} = $info
my %data = ();
# Perform an HTTP GET request and hand back the raw response.
#
# Parameters:
#   $url - absolute URL to fetch.
#
# Returns the response object produced by LWP::UserAgent->get. HTTP failures
# are not raised here; callers check ->is_success on the result.
sub getURL {
    my ($url) = @_;

    # Optional package variable; nothing visible in this script assigns it.
    our $USER_AGENT;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);

    # Only override the agent string when one was actually configured.
    # Handing an undefined value to agent() would throw away the default
    # identification string that LWP sends.
    if (defined $USER_AGENT && length $USER_AGENT) {
        $ua->agent($USER_AGENT);
    }

    return $ua->get($url);
}
# Download the makers overview page and record every brand logo found on it.
#
# For each matching <img> (height 22, width 92, border 0) an entry is written
# to %data under a running numeric index:
#   name - the image's alt text
#   logo - the image's src
#   link - the href of the image's parent element
#
# Takes no arguments and returns nothing; does nothing when the request fails.
sub get_brands {
    my $response = getURL("http://www.gsmarena.com/makers.php3");
    return unless $response->is_success;

    my $html  = HTML::TreeBuilder->new_from_content($response->decoded_content);
    my @logos = $html->look_down(_tag => 'img', height => '22', width => '92', border => '0');

    my $index = 0;
    foreach my $img (@logos) {
        my $anchor = $img->parent;
        next unless $anchor;

        $data{$index}{name} = $img->attr('alt');
        $data{$index}{logo} = $img->attr('src');
        $data{$index}{link} = $anchor->attr('href');
        $index++;

        # Debug limit kept from the original PoC: stop after two brands.
        # Leave the loop (instead of returning) so the tree is still freed.
        last if $index == 2; # REMOVE!
    }

    # Release the parse tree explicitly once the attributes are copied out.
    $html->delete;
    return;
}
# Running model index inside the brand currently being crawled. It is
# advanced by get_models and reset by get_pages.
my $mm = 0;

# Walk every brand stored in %data and collect the models of all its listing
# pages: the brand's own page plus every link found in its 'nav-pages' div.
#
# Takes no arguments and returns nothing. Brands whose page cannot be fetched
# are skipped.
sub get_pages {
    foreach my $i (keys %data) {
        # Start numbering at zero once per brand. Resetting after every
        # pagination page made later pages overwrite the models stored from
        # earlier pages of the same brand.
        $mm = 0;

        my $response = getURL("http://www.gsmarena.com/$data{$i}{link}");
        next unless $response->is_success;

        my $html = HTML::TreeBuilder->new_from_content($response->decoded_content);
        my $nav  = $html->look_down(_tag => 'div', class => 'nav-pages');

        # No 'nav-pages' div means there are no further pages to follow;
        # calling look_down on the missing element would be fatal.
        my @hrefs;
        if ($nav) {
            @hrefs = grep { defined $_ }
                     map  { $_->attr('href') }
                     $nav->look_down(_tag => 'a');
        }

        # The hrefs are plain strings now, so the tree can be released
        # before the nested downloads start.
        $html->delete;

        get_models($data{$i}{link}, $i);
        foreach my $href (@hrefs) {
            get_models($href, $i);
        }
    }
    return;
}
# Read one listing page and store every model found in its 'makers' div.
#
# Parameters:
#   $URL - page path, appended to http://www.gsmarena.com/
#   $ID  - brand index in %data that the models belong to
#
# Each <li> becomes $data{$ID}{models}{$mm} with the keys link, img, title
# and name; get_model is then called to add the model's specs, and the
# file-level counter $mm is advanced. Returns nothing; does nothing when the
# request fails or the page has no 'makers' div.
sub get_models {
    my ($URL, $ID) = @_;

    my $response = getURL("http://www.gsmarena.com/$URL");
    return unless $response->is_success;

    my $html   = HTML::TreeBuilder->new_from_content($response->decoded_content);
    my $makers = $html->look_down(_tag => 'div', class => 'makers');
    my @items  = $makers ? $makers->look_down(_tag => 'li') : ();

    foreach my $item (@items) {
        my $link = $item->look_down(_tag => 'a');
        my $href = $link ? $link->attr('href') : undef;

        # Without a link there is no model page to fetch; skip the entry
        # rather than requesting the site root.
        next unless defined $href;

        my $img  = $item->look_down(_tag => 'img');
        my $name = $item->look_down(_tag => 'span');

        # Missing <img>/<span> elements yield undef values instead of a
        # fatal method call on an undefined value.
        my $entry = $data{$ID}{models}{$mm} ||= {};
        $entry->{link}  = $href;
        $entry->{img}   = $img  ? $img->attr('src')   : undef;
        $entry->{title} = $img  ? $img->attr('title') : undef;
        $entry->{name}  = $name ? $name->as_text      : undef;

        get_model($href, $ID, $mm);
        $mm++;
    }

    # Release the parse tree explicitly once the page is processed.
    $html->delete;
    return;
}
# Read the spec tables of a single model page, e.g.
# http://www.gsmarena.com/alcatel_pop_c9-5938.php
#
# Parameters:
#   $URL - page path, appended to http://www.gsmarena.com/
#   $M   - brand index in %data
#   $A   - model index below $data{$M}{models}
#
# Every <table cellspacing="0"> is treated as one section named after the
# text of its first <th>. Each row contributes
#   $data{$M}{models}{$A}{specs}{$section}{$title} = $info
# where $title / $info are the texts of the row's 'ttl' / 'nfo' cells.
# "N/A" stands in for a missing heading or cell. Returns nothing; does
# nothing when the request fails.
sub get_model {
    my ($URL, $M, $A) = @_;

    my $response = getURL("http://www.gsmarena.com/$URL");
    return unless $response->is_success;

    my $html = HTML::TreeBuilder->new_from_content($response->decoded_content);

    foreach my $table ($html->look_down(_tag => 'table', cellspacing => '0')) {
        my $th      = $table->look_down(_tag => 'th');
        my $section = $th ? $th->as_text : "N/A";

        foreach my $row ($table->look_down(_tag => 'tr')) {
            # Look the cells up inside the current row so that a title is
            # paired with the value sitting next to it. Searching the whole
            # table (or a single <td>) can never see both cells together.
            my $ttl = $row->look_down(_tag => 'td', class => 'ttl');
            my $nfo = $row->look_down(_tag => 'td', class => 'nfo');
            next unless $ttl || $nfo;

            my $title = $ttl ? $ttl->as_text : "N/A";
            my $info  = $nfo ? $nfo->as_text : "N/A";
            $data{$M}{models}{$A}{specs}{$section}{$title} = $info;
        }
    }

    # Release the parse tree explicitly once the specs are copied out.
    $html->delete;
    return;
}
# Script entry point. The full crawl (brands, then listing pages and models)
# is disabled below; only one hard-coded model page is fetched and its specs
# are stored under brand index 0, model index 0 of %data.
# NOTE(review): no statement visible in this part of the file prints or
# serialises %data afterwards.
#get_brands();
#get_pages();
get_model("samsung_galaxy_on7-7679.php", 0, 0);