diff options
Diffstat (limited to 'perlebooks')
-rwxr-xr-x | perlebooks | 190 |
1 files changed, 190 insertions, 0 deletions
diff --git a/perlebooks b/perlebooks new file mode 100755 index 0000000..8a42ebd --- /dev/null +++ b/perlebooks @@ -0,0 +1,190 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use utf8; + +use Cwd qw(abs_path); +use Encode qw(decode encode); +use File::Basename; +use HTML::Entities; +use IO::File; +use Net::Twitter::Lite::WithAPIv1_1; +use Scalar::Util qw(blessed); +use YAML; + +binmode(STDIN, ':utf8'); +binmode(STDOUT, ':utf8'); +binmode(STDERR, ':utf8'); + +my $dir = dirname(abs_path($0)); +my $CONFFILE = "$dir/conf.yaml"; +my $LASTIDFILE = "$dir/lastid"; +my $CORPUSFILE = "$dir/corpus"; +my $BATCHCOUNT = 200; +my $MAXLEN = 140; + +my %conf = %{YAML::LoadFile($CONFFILE)}; + +my $nt = login($conf{access_secret}, + $conf{access_token}, + $conf{consumer_secret}, + $conf{consumer_key}); +my $lastid = lastid($LASTIDFILE); +$lastid = read_tl($nt, $conf{user}, $lastid, $BATCHCOUNT, $CORPUSFILE); +updatelastid($LASTIDFILE, $lastid) if $lastid; + +my $tweet = gettweet($CORPUSFILE); +utf8::upgrade($tweet); +#print "$tweet\n"; +$nt->update($tweet); + +sub login { + my ($access_secret, $access_token, $consumer_secret, $consumer_key) = @_; + + Net::Twitter::Lite::WithAPIv1_1->new(access_token_secret => $access_secret, + access_token => $access_token, + consumer_secret => $consumer_secret, + consumer_key => $consumer_key, + ssl => 1) || die "Couldn't log in to twitter: $@.\n"; +} + +sub lastid { + my $fn = shift; + + my $id; + my $fh = IO::File->new($fn, 'r'); + if ($fh) { + $id = <$fh>; + chomp $id; + } + + $id; +} + +# TODO: Make atomic. +sub updatelastid { + my ($fn, $id) = @_; + + my $fh = IO::File->new($fn, 'w') || die "Couldn't open $fn for writing: $!.\n"; + print $fh "$id\n"; +} + +sub read_tl { + my ($nt, $user, $id, $count, $corpusfn) = @_; + + my $fh = IO::File->new($corpusfn, 'a') || die "Couldn't open $corpusfn for appending: $!.\n"; + binmode($fh, ':utf8'); + + my $rc = eval { + my $params = {screen_name => $user, + count => $count, + exclude_replies => 1, + trim_user => 1, + include_rts => 0}; + $params->{since_id} = $id if $id; + + my $latestid; + while ((my @statuses = @{$nt->user_timeline($params)}) > 0) { + foreach my $status (@statuses) { + $status->{text} = escape($status->{text}); + print $fh "$status->{id}\t$status->{created_at}\t$status->{text}\n"; + } + $params->{max_id} = $statuses[$#statuses]->{id} - 1; + $latestid = $statuses[0]->{id} unless $latestid; + } + $latestid; + }; + + if (my $err = $@) { + die $@ unless blessed $err && $err->isa('Net::Twitter::Lite::Error'); + + print STDERR $err->twitter_error->{errors}->[0]->{code} . ": " . + $err->twitter_error->{errors}->[0]->{message} . "\n"; + } + + return $rc; +} + +sub gettweet { + my $fn = shift; + + my $fh = IO::File->new($fn, 'r') || die "Couldn't open $fn for reading: $!.\n"; + binmode($fh, ':utf8'); + + my %dict; + while (<$fh>) { + my ($id, $date, $text) = split /\t/; + + $text = unescape($text); + + my @toks = split /\s+/, $text; + for (my $i = 1; $i <= $#toks; $i++) { + my %children = %{$dict{$toks[$i - 1]} || {}}; + $children{$toks[$i]} = ($children{$toks[$i]} || 0) + 1; + $dict{$toks[$i - 1]} = \%children; + } + } + + return genchain($MAXLEN, %dict); +} + +sub genchain { + my ($len, %dict) = @_; + + my $chain; + + my @words = keys %dict; + my $word = @words[rand(@words)]; + + do { + $chain .= ' ' if $chain; + $chain .= $word; + + $word = nextword($word, %dict); + return $chain if defined($word) && length("$chain $word") > $MAXLEN; + } until !defined($word); + + return $chain; +} + +sub nextword { + my ($word, %dict) = @_; + + my %choices = %{$dict{$word} || {}}; + + my @choicearr = map { [$_, $choices{$_}] } keys %choices; + my @sorted = sort { $b->[1] <=> $a->[1] } @choicearr; + + my $total = 0; + foreach my $count (values %choices) { + $total += $count; + } + + my $nextind = rand $total; + foreach my $choice (@choicearr) { + $nextind -= $choice->[1]; + return $choice->[0] unless $nextind > 0; + } +} + +sub escape { + my $str = shift; + + # HTML decoding is one-way, thanks twitter. + $str = decode_entities($str); + + $str =~ s/\\/\\\\/g; + $str =~ s/\n/\\n/g; + $str =~ s/\t/\\t/g; + $str +} + +sub unescape { + my $str = shift; + + $str =~ s/\\n/\n/g; + $str =~ s/\\t/\t/g; + $str =~ s/\\\\/\\/g; + $str; +} |