summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Cully <bjc@kublai.com>2017-04-26 04:47:19 +0000
committerBrian Cully <bjc@kublai.com>2017-04-26 05:41:18 +0000
commit52a0b1df2c4dc14161680936ddaf6b4bd3ea7fe6 (patch)
tree8e53ae20ebeb27400dd77511ac8d27a12df9aa29
downloadperlebooks-master.tar.gz
perlebooks-master.zip
Initial commit.HEADmaster
-rw-r--r--.gitignore3
-rw-r--r--conf.yaml-sample6
-rwxr-xr-xperlebooks190
3 files changed, 199 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2ee25fd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+conf.yaml
+corpus
+lastid
diff --git a/conf.yaml-sample b/conf.yaml-sample
new file mode 100644
index 0000000..f39b583
--- /dev/null
+++ b/conf.yaml-sample
@@ -0,0 +1,6 @@
+---
+ user: 'SomeTwitterUser'
+ access_secret: 'YourAccessSecret'
+ access_token: 'YourAccessToken'
+ consumer_key: 'YourConsumerKey'
+ consumer_secret: 'YourConsumerSecret'
diff --git a/perlebooks b/perlebooks
new file mode 100755
index 0000000..8a42ebd
--- /dev/null
+++ b/perlebooks
@@ -0,0 +1,190 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use utf8;
+
+use Cwd qw(abs_path);
+use Encode qw(decode encode);
+use File::Basename;
+use HTML::Entities;
+use IO::File;
+use Net::Twitter::Lite::WithAPIv1_1;
+use Scalar::Util qw(blessed);
+use YAML;
+
+binmode(STDIN, ':utf8');
+binmode(STDOUT, ':utf8');
+binmode(STDERR, ':utf8');
+
+my $dir = dirname(abs_path($0));
+my $CONFFILE = "$dir/conf.yaml";
+my $LASTIDFILE = "$dir/lastid";
+my $CORPUSFILE = "$dir/corpus";
+my $BATCHCOUNT = 200;
+my $MAXLEN = 140;
+
+my %conf = %{YAML::LoadFile($CONFFILE)};
+
+my $nt = login($conf{access_secret},
+ $conf{access_token},
+ $conf{consumer_secret},
+ $conf{consumer_key});
+my $lastid = lastid($LASTIDFILE);
+$lastid = read_tl($nt, $conf{user}, $lastid, $BATCHCOUNT, $CORPUSFILE);
+updatelastid($LASTIDFILE, $lastid) if $lastid;
+
+my $tweet = gettweet($CORPUSFILE);
+utf8::upgrade($tweet);
+#print "$tweet\n";
+$nt->update($tweet);
+
+sub login {
+ my ($access_secret, $access_token, $consumer_secret, $consumer_key) = @_;
+
+ Net::Twitter::Lite::WithAPIv1_1->new(access_token_secret => $access_secret,
+ access_token => $access_token,
+ consumer_secret => $consumer_secret,
+ consumer_key => $consumer_key,
+ ssl => 1) || die "Couldn't log in to twitter: $@.\n";
+}
+
+sub lastid {
+ my $fn = shift;
+
+ my $id;
+ my $fh = IO::File->new($fn, 'r');
+ if ($fh) {
+ $id = <$fh>;
+ chomp $id;
+ }
+
+ $id;
+}
+
+# TODO: Make atomic.
+sub updatelastid {
+ my ($fn, $id) = @_;
+
+ my $fh = IO::File->new($fn, 'w') || die "Couldn't open $fn for writing: $!.\n";
+ print $fh "$id\n";
+}
+
+sub read_tl {
+ my ($nt, $user, $id, $count, $corpusfn) = @_;
+
+ my $fh = IO::File->new($corpusfn, 'a') || die "Couldn't open $corpusfn for appending: $!.\n";
+ binmode($fh, ':utf8');
+
+ my $rc = eval {
+ my $params = {screen_name => $user,
+ count => $count,
+ exclude_replies => 1,
+ trim_user => 1,
+ include_rts => 0};
+ $params->{since_id} = $id if $id;
+
+ my $latestid;
+ while ((my @statuses = @{$nt->user_timeline($params)}) > 0) {
+ foreach my $status (@statuses) {
+ $status->{text} = escape($status->{text});
+ print $fh "$status->{id}\t$status->{created_at}\t$status->{text}\n";
+ }
+ $params->{max_id} = $statuses[$#statuses]->{id} - 1;
+ $latestid = $statuses[0]->{id} unless $latestid;
+ }
+ $latestid;
+ };
+
+ if (my $err = $@) {
+ die $@ unless blessed $err && $err->isa('Net::Twitter::Lite::Error');
+
+ print STDERR $err->twitter_error->{errors}->[0]->{code} . ": " .
+ $err->twitter_error->{errors}->[0]->{message} . "\n";
+ }
+
+ return $rc;
+}
+
+sub gettweet {
+ my $fn = shift;
+
+ my $fh = IO::File->new($fn, 'r') || die "Couldn't open $fn for reading: $!.\n";
+ binmode($fh, ':utf8');
+
+ my %dict;
+ while (<$fh>) {
+ my ($id, $date, $text) = split /\t/;
+
+ $text = unescape($text);
+
+ my @toks = split /\s+/, $text;
+ for (my $i = 1; $i <= $#toks; $i++) {
+ my %children = %{$dict{$toks[$i - 1]} || {}};
+ $children{$toks[$i]} = ($children{$toks[$i]} || 0) + 1;
+ $dict{$toks[$i - 1]} = \%children;
+ }
+ }
+
+ return genchain($MAXLEN, %dict);
+}
+
+sub genchain {
+ my ($len, %dict) = @_;
+
+ my $chain;
+
+ my @words = keys %dict;
+ my $word = @words[rand(@words)];
+
+ do {
+ $chain .= ' ' if $chain;
+ $chain .= $word;
+
+ $word = nextword($word, %dict);
+ return $chain if defined($word) && length("$chain $word") > $MAXLEN;
+ } until !defined($word);
+
+ return $chain;
+}
+
+sub nextword {
+ my ($word, %dict) = @_;
+
+ my %choices = %{$dict{$word} || {}};
+
+ my @choicearr = map { [$_, $choices{$_}] } keys %choices;
+ my @sorted = sort { $b->[1] <=> $a->[1] } @choicearr;
+
+ my $total = 0;
+ foreach my $count (values %choices) {
+ $total += $count;
+ }
+
+ my $nextind = rand $total;
+ foreach my $choice (@choicearr) {
+ $nextind -= $choice->[1];
+ return $choice->[0] unless $nextind > 0;
+ }
+}
+
+sub escape {
+ my $str = shift;
+
+ # HTML decoding is one-way, thanks twitter.
+ $str = decode_entities($str);
+
+ $str =~ s/\\/\\\\/g;
+ $str =~ s/\n/\\n/g;
+ $str =~ s/\t/\\t/g;
+ $str
+}
+
+sub unescape {
+ my $str = shift;
+
+ $str =~ s/\\n/\n/g;
+ $str =~ s/\\t/\t/g;
+ $str =~ s/\\\\/\\/g;
+ $str;
+}