Jump to content

User:AnomieBOT/source/tasks/TemplateReplacer13.pm: Difference between revisions

From Wikipedia, the free encyclopedia
Content deleted Content added
AnomieBOT (talk | contribs)
Updating published sources: TemplateReplacer13, TemplateReplacer14: * Add some cleanups for misnamed "External links" sections. * Recognize more variations, in particular a level-three heading. General: * Bugfix in rawpage(). * Bugfix in process_para
AnomieBOT (talk | contribs)
Updating published sources: TemplateReplacer13: * <nowiki>Don't consider "http://example.com" and "http://example.com/" to be different.</nowiki>
Line 130: Line 130:
'website' => [
'website' => [
'%X',
'%X',
'%X/',
],
],
'imdb_id' => [
'imdb_id' => [
Line 302: Line 303:
next unless grep($_ eq $param, @process);
next unless grep($_ eq $param, @process);
my $id=$infobox_params{$param};
my $id=$infobox_params{$param};
$id=~s{/$}{};
foreach (@{$ext_links{$param}}){
foreach (@{$ext_links{$param}}){
my $link=$_;
my $link=$_;

Revision as of 01:20, 4 March 2009

package tasks::TemplateReplacer13;

=pod

=begin metadata

Task:    TemplateReplacer13
BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 24
Status:  Approved 2009-03-01
Rate:    Max 6 edits/minute
Created: 2009-02-23

Replace the obsolete {{tlx|Infobox Film}} <code>website</code>,
<code>imdb_id</code>, and <code>amg_id</code> parameters with {{tl|official}},
{{tl|imdb title}}, and {{tl|amg movie}} in the External links section,
respectively.

=end metadata

=cut

use utf8;
use strict;

use AnomieBOT::Task;
use Digest::SHA qw/sha256_base64/;
use Data::Dumper;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# Compiled regex matching the opening of any film external link template
# (e.g. "{{imdb title|"). It is built lazily on the first call to run() from
# the members of Category:Film external link templates plus their redirects,
# and then kept for subsequent calls.
my $film_extlink_templates_re=undef;
# When true, run() does not edit a page if the only change would be removing
# the infobox parameters (i.e. every link is already in External links).
my $no_edit_just_to_remove_parameters=1;

sub new {
    # Constructor: delegate to the parent class constructor (called without
    # arguments), then make sure the result is blessed into the requested class.
    my ($class)=@_;
    my $self=$class->SUPER::new();
    return bless($self, $class);
}

=pod

=for info
Approved 2009-03-01<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 24]]

=cut

sub approved {
    # Always reports 1; the approval itself is recorded in the POD block
    # directly above this sub.
    my $is_approved=1;
    return $is_approved;
}

# Turn a "Template:Foo bar" page title into a regex fragment that matches the
# template name as it may be written in wikitext: the first letter matches
# case-insensitively and a space may also be written as an underscore.
# Everything else is escaped with quotemeta() so it matches literally.
# Returns nothing when no name remains after the 9-character "Template:"
# prefix.
sub _extlink_template_name_re {
    my ($title)=@_;
    return unless defined($title) && length($title)>9;
    my $name=substr($title,9);
    my $first=quotemeta(substr($name,0,1));
    my $rest=quotemeta(substr($name,1));
    # quotemeta() turns " " into "\ "; widen that to "space or underscore".
    $rest=~s/\\ /[ _]/g;
    return "(?i:$first)$rest";
}

# Main entry point of the task.
#
# For every mainspace page transcluding {{Infobox Film}}, strips the
# website / imdb_id / amg_id parameters from the infobox and, unless the
# corresponding link is already present, adds the matching external link
# template to the "External links" section (creating the section if needed).
#
# Parameters:
#   $self - the task object
#   $api  - the bot API object used for queries, edits and persistent storage
#
# Returns a number of seconds (600 is "10 minutes or so" below):
#   60  - a query failed, try again soon
#   300 - the task has been shut off
#   0   - the 5 minute time slice is used up
#   600 - every page has been looked at
#
# Per-page state is recorded with $api->store($pageid, ...); any page with a
# stored value is skipped on later runs. Values used:
#   \1       multiple infoboxes on the page (logged)
#   \2       no infobox found on the page (logged)
#   \1000000 none of the parameters of interest is set
#   \1000001 nothing to add, and edits that only remove parameters are disabled
#   \2000000 page was edited successfully
sub run {
    my ($self, $api)=@_;
    my $res;

    $api->task('TemplateReplacer13');
    $api->read_throttle(0);
    $api->edit_throttle(10);

    return 60 if(!defined($self->load_interlanguage_map($api)));

    # Build (once) the regex recognising existing film external link templates.
    if(!defined($film_extlink_templates_re)){
        my @links=();
        my $res=$api->query([],
            list        => 'categorymembers',
            cmtitle     => 'Category:Film external link templates',
            cmnamespace => 10,
            cmlimit     => 'max',
        );
        if($res->{'code'} ne 'success'){
            $self->warn("Failed to retrieve film external link template list: ".$res->{'error'}."\n");
            return 60;
        }
        unshift @{$res->{'query'}{'categorymembers'}}, { title=>'Template:Official' };
        foreach my $member (@{$res->{'query'}{'categorymembers'}}){
            my $t=$member->{'title'};
            next if $t=~m{/(?:doc|sandbox)$}i;
            my $name_re=_extlink_template_name_re($t);
            push @links, $name_re if defined($name_re);

            # Redirects to the template are valid names for it too.
            my $res2=$api->query([],
                list          => 'backlinks',
                bltitle       => $t,
                blfilterredir => 'redirects',
                bllimit       => 'max',
            );
            if($res2->{'code'} ne 'success'){
                $self->warn("Failed to retrieve redirects for $t: ".$res2->{'error'}."\n");
                return 60;
            }
            foreach my $redirect (@{$res2->{'query'}{'backlinks'}}){
                my $redirect_re=_extlink_template_name_re($redirect->{'title'});
                push @links, $redirect_re if defined($redirect_re);
            }
        }
        my $alternation=join('|', @links);
        # Braces are escaped so the pattern stays valid on Perl versions that
        # reject unescaped literal "{" in a regex.
        $film_extlink_templates_re=qr/\{\{\s*(?:$alternation)\s*(?:\||\}\})/;
    }

    my $req="[[WP:BOTREQ#IMDb links|request]]";

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    $self->_output_log($api);

    # Get a list of templates redirecting to our target
    my %templates=();
    $templates{"Template:Infobox Film"}=1;
    $res=$api->query([],
        list          => 'backlinks',
        bltitle       => "Template:Infobox Film",
        blfilterredir => 'redirects',
        bllimit       => 'max',
    );
    if($res->{'code'} ne 'success'){
        $self->warn("Failed to retrieve redirects for Template:Infobox Film: ".$res->{'error'}."\n");
        return 60;
    }
    $templates{$_->{'title'}}=1 foreach (@{$res->{'query'}{'backlinks'}});

    # Matching external links; "%X" is the text of the template parameter
    my @to_process=qw/website imdb_id amg_id/;
    my %ext_links=(
        'website' => [
            '%X',
            '%X/',
        ],
        'imdb_id' => [
            'http://www.imdb.com/title/tt%X',
            'http://www.imdb.com/title/tt%X/',
            'http://imdb.com/title/tt%X',
            'http://imdb.com/title/tt%X/',
        ],
        'amg_id' => [
            'http://allmovie.com/cg/avg.dll?p=avg&sql=%X',
            'http://www.allmovie.com/cg/avg.dll?p=avg&sql=%X',
        ],
    );
    # External link generating templates; parameter 1 is the id, and optional
    # parameter 2 is the infobox's "name" parameter.
    my %ext_templates=(
        'website' => 'official',
        'imdb_id' => 'imdb title',
        'amg_id' => 'amg movie',
    );

    # Get the list of pages to check
    my %q=(
        list        => 'embeddedin',
        eititle     => "Template:Infobox Film",
        einamespace => 0,
        eilimit     => 'max',
    );
    do {
        # NOTE(review): every other query in this sub passes [] as the first
        # argument; this call does not. Confirm against the API class whether
        # that is intentional.
        $res=$api->query(%q);
        if($res->{'code'} ne 'success'){
            $self->warn("Failed to retrieve transclusion list for Template:Infobox Film: ".$res->{'error'}."\n");
            return 60;
        }
        if(exists($res->{'query-continue'})){
            $q{'eicontinue'}=$res->{'query-continue'}{'embeddedin'}{'eicontinue'};
        } else {
            delete $q{'eicontinue'};
        }

        # Process found pages
        foreach my $page (@{$res->{'query'}{'embeddedin'}}){
            my $pageid=$page->{'pageid'};
            # Anything stored for this page id means it was already handled.
            next if defined($api->fetch($pageid));

            # Cleanup the log
            my $log=$api->fetch('log');
            $log={} unless defined($log);
            delete $log->{$_}{$pageid} foreach (keys %$log);
            $api->store('log', $log);

            my $title=$page->{'title'};
            $self->warn("Processing $title\n");

            # WTF?
            if(exists($page->{'missing'})){
                $self->warn("$title is missing? WTF?\n");
                next;
            }

            # Ok, check the page
            my $tok=$api->edittoken($title);
            if($tok->{'code'} eq 'shutoff'){
                $self->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $self->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                next;
            }
            next if exists($tok->{'missing'});

            # Get page text
            my $intxt=$tok->{'revisions'}[0]{'*'};

            # Step 1: Find the parameters for the infobox. Also, strip the
            # parameters we are intending to process.
            my %infobox_params=();
            my $ct=0;           # number of infobox instances seen
            my @process=();     # non-empty parameters of interest, in page order
            my $outtxt=$self->process_templates($intxt, sub {
                my $name=shift;
                my @params=@{shift()};
                shift; # $wikitext
                shift; # $data
                my $oname=shift;

                return undef unless exists($templates{"Template:$name"});
                if($ct++>0){ # More than one infobox?
                    $self->_log($api, 'Multiple infoboxen', $pageid, $title, "$ct instances of the infobox detected.");
                    $api->store($pageid, \1);
                    return undef;
                }
                my @out=();
                foreach ($self->process_paramlist(@params)){
                    $_->{'value'}=~s/^\s+|\s+$//g;
                    $infobox_params{$_->{'name'}}=$_->{'value'} unless $_->{'value'} eq '';
                    if(exists($ext_links{$_->{'name'}})){
                        # Dropped from the rebuilt infobox; remembered only
                        # when it actually carries a value.
                        push @process, $_->{'name'} unless $_->{'value'} eq '';
                    } else {
                        push @out, $_->{'text'};
                    }
                }
                return "{{$oname|".join("|", @out)."}}";
            });
            next if $ct>1;
            if($ct<1){
                $self->_log($api, 'No infobox', $pageid, $title, "No instance of the infobox was found in the page.");
                $api->store($pageid, \2);
                next;
            }
            unless(@process){
                # Nothing to do here.
                $api->store($pageid, \1000000);
                next;
            }

            # Step 2: Extract the external links section
            my $nowiki;
            ($outtxt,$nowiki)=$self->strip_nowiki($outtxt);
            my @sections=();
            my $extlink_section=undef;
            my $i=0;
            foreach (split /(?=(?:^|\n)==(=?)[^=](?:.*[^=])?\1==\s*\n)/, $outtxt){
                next if ($i++&1)==1; # Odd-numbered indicies are $1 from the pattern above
                # Normalise misnamed headings ("External", "External link(s)",
                # "External link", "External reference(s)") to "External links".
                # This is done BEFORE the section text is copied into
                # @sections so that the clean-up is part of the saved page
                # text, not just of the detection below.
                s/^(\n?==)(=?)\s*External\s*(\2==(?:\s*<!--.*-->)?\s*(?:\n|$))/$1$2 External links $3/i;
                s/^(\n?==)(=?)(.*)External link\(s\)(.*\2==(?:\s*<!--.*-->)?\s*(?:\n|$))/$1$2$3External links$4/i;
                s/^(\n?==)(=?)(.*)External link((?!s).*\2==(?:\s*<!--.*-->)?\s*(?:\n|$))/$1$2$3External links$4/i;
                s/^(\n?==)(=?)(.*)External references?(.*\2==(?:\s*<!--.*-->)?\s*(?:\n|$))/$1$2$3External links$4/i;
                my $s=$self->replace_nowiki($_, $nowiki);
                push @sections, \$s;
                $extlink_section=\$s if /^\n?==(=?)\s*(?:External links(?: (?:and|&|&amp;) (?:references|sources|resources|further reading))?|(?:References|Sources|Resources) (?:and|&|&amp;) External links)\s*\1==(?:\s*<!--.*-->)?\s*(?:\n|$)/i;
            }
            if(!defined($extlink_section)){
                # Crap, we have to create an external links section.
                $self->_log($api, 'Added "External links"', $pageid, $title, "No \"External links\" section was found in the page. Check if one was added in the right place.");
                my $x=pop @sections;
                my ($pre,$post)=$self->extract_end_content($api, $$x);
                return 60 if(!defined($pre));
                $pre=~s/\s+$/\n/;
                push @sections, \$pre;
                my $dummy="\n== External links ==\n\n";
                $extlink_section=\$dummy;
                push @sections, $extlink_section;
                push @sections, \$post;
            } elsif($extlink_section==$sections[-1]){
                # Last section, strip off the post-content junk
                my $x=pop @sections;
                my ($pre,$post)=$self->extract_end_content($api, $$x);
                return 60 if(!defined($pre));
                $extlink_section=\$pre;
                push @sections, $extlink_section;
                push @sections, \$post;
            }

            # Step 3: Process our parameters
            # Have the section parsed to learn which external links it
            # already contains.
            my $parse_res=$api->query([],
                action=>'parse',
                text=>$$extlink_section,
                prop=>'externallinks',
            );
            if($parse_res->{'code'} ne 'success'){
                $self->warn("Failed to parse external links section for $title: ".$parse_res->{'error'}."\n");
                return 60;
            }
            my @el=();
            @el=@{$parse_res->{'parse'}{'externallinks'}} if exists($parse_res->{'parse'}{'externallinks'});
            my $add='';
            my $has_website=0;
            PARAM:
            foreach my $param (@to_process){
                next unless grep($_ eq $param, @process);
                my $id=$infobox_params{$param};
                # Ignore a trailing slash so "http://example.com" and
                # "http://example.com/" are treated as the same link.
                $id=~s{/$}{};
                foreach (@{$ext_links{$param}}){
                    my $link=$_;
                    $link=~s/%X/$id/g;
                    # Already linked in the section: nothing to add.
                    next PARAM if grep($_ eq $link, @el);
                }
                my $tmpl=$ext_templates{$param};
                $add.="\n* {{$tmpl|$id";
                $add.='|'.$infobox_params{'name'} if exists($infobox_params{'name'});
                $add.="}}";
                $has_website=1 if $param eq 'website';
            }

            # Step 4: Reassemble the page, if anything changed in step 3
            if($add ne ''){
                if($has_website && $$extlink_section=~s/\n\*/$add\n*/){
                    # Move "website" to the top of the external links
                } elsif($$extlink_section=~s/(\n\*\s*$film_extlink_templates_re.*?)\n/$1$add\n/){
                    # Put it after any other existing film external link templates
                } elsif($$extlink_section=~s/(\s*\n===)/$add$1/){
                    # There is a subsection in there (e.g. "Reviews"), put the
                    # links before it.
                } else {
                    # Just tack it on the end.
                    $$extlink_section=~s/(\s*)$/$add$1/;
                }
                $outtxt=join('', map { $$_ } @sections);
            } elsif($no_edit_just_to_remove_parameters){
                $api->store($pageid, \1000001);
                next;
            } else {
                $outtxt=$self->replace_nowiki($outtxt, $nowiki);
            }

            # Step 5: Perform the edit.
            $process[-1]='and '.$process[-1] if @process>1;
            my $summary='Moving deprecated '.join((@process>2)?', ':' ', @process).' from {{Infobox Film}} to External links per '.$req;
            $self->warn("$summary in $title\n");
            my $r=$api->edit($tok, $outtxt, $summary, 0, 1);
            if($r->{'code'} ne 'success'){
                $self->warn("Write failed on $title: ".$r->{'error'}."\n");
                next;
            }

            # Mark this page as done
            $api->store($pageid, \2000000);

            # If we've been at it long enough, let another task have a go.
            if(time()>=$endtime){
                $self->_output_log($api);
                return 0;
            }
        }
    } while(exists($q{'eicontinue'}));

    # No more pages to check, try again in 10 minutes or so in case of errors.
    $self->_output_log($api);
    return 600;
}

# Record an issue for a page in the persistent log.
#
# The log lives under the 'log' key of the API object's storage as
# { section => { pageid => [title, message] } }. A later entry for the same
# section and page id overwrites the earlier one. The message is also sent to
# $self->warn.
sub _log {
    my ($self, $api, $section, $pageid, $title, $message)=@_;

    my $log=$api->fetch('log');
    $log={} if !defined($log);
    # Assigning through the nested key creates the section hash when needed.
    $log->{$section}{$pageid}=[$title, $message];
    $api->store('log', $log);
    $self->warn("LOG: $title: $message\n");
}

# Write the accumulated log to "User:AnomieBOT/TemplateReplacer13 log".
#
# The page text is regenerated from the stored 'log' hash
# ({ section => { pageid => [title, message] } }): one "== section ==" block
# per section (sorted by name), listing every page for which a value is
# currently stored under its page id. Sections with no such page are omitted.
# Entries inside a section are sorted by title (then page id) so that the
# generated text is stable from run to run; otherwise hash ordering could
# make the text differ from the page and trigger a pointless rewrite.
# The page is only edited when the generated text differs from its current
# content.
#
# Returns 300 if the task is shut off; otherwise nothing meaningful (callers
# in this file ignore the return value).
sub _output_log {
    my ($self, $api)=@_;

    $self->warn("Updating log");
    my $tok=$api->edittoken("User:AnomieBOT/TemplateReplacer13 log");
    if($tok->{'code'} eq 'shutoff'){
        $self->warn("Task disabled: ".$tok->{'content'}."\n");
        return 300;
    }
    if($tok->{'code'} ne 'success'){
        $self->warn("Failed to get edit token for log: ".$tok->{'error'}."\n");
        return;
    }
    my $header="This is a log of issues encountered during the processing of the task TemplateReplacer13. Do not edit this page, the bot will overwrite it.\n";
    # A missing page is treated as if it contained just the header.
    my $intxt=exists($tok->{'missing'})?$header:$tok->{'revisions'}[0]{'*'};
    my $outtxt=$header;
    my $log=$api->fetch('log');
    $log={} unless defined($log);
    foreach my $section (sort keys %$log){
        my $entries=$log->{$section};
        my @pageids=sort {
            $entries->{$a}[0] cmp $entries->{$b}[0] or $a cmp $b
        } keys %$entries;
        my @out=();
        foreach my $pageid (@pageids){
            # Only list pages that still have a stored status.
            next unless defined($api->fetch($pageid));
            my ($title,$message)=@{$entries->{$pageid}};
            push @out, "* [[:$title]]: $message\n";
        }
        next unless @out;
        $outtxt.="\n== $section ==\n".join('', @out);
    }
    if($outtxt ne $intxt){
        my $r=$api->edit($tok, $outtxt, 'Updating log', 0, 0);
        if($r->{'code'} ne 'success'){
            $self->warn("Could not write log: ".$r->{'error'}."\n");
            return;
        }
    }
}

1;