Perl NAR Database summary text parser
From MetaBase
Don't say I didn't warn you!
#!/usr/bin/perl -w
use strict;
# Directory that is scanned for scraped summary pages.
my $scrapeDir = "Scrape";

# Diagnostic level: values above 0 report where the page body starts,
# values above 1 additionally dump every field as it is parsed.
my $verbose = 0;

# Text seen under each H3 heading, keyed by heading title.  Filled while
# parsing and only used for the heading report at the end of the run.
my %dbH3;

# The directory handle is read by the main per-file loop below.
opendir(DIR, $scrapeDir)
    || die "cant open $scrapeDir : $! \n";
# Main driver.  Every file in $scrapeDir named db_accn_NNNN is read line by
# line and converted into one wiki template page, printed to STDOUT between
# "<StartPageHere>" and "<EndPageHere>" markers.  Progress and diagnostics
# go to STDERR.  The parser is a line-oriented state machine: each *Flag
# variable records that a field has already been captured, so the order of
# the checks below (and of their "next" statements) matters.
FILE:
while (my $file = readdir(DIR)) {
    # Anything that is not db_accn_<4 digits> (including "." and "..") is
    # reported and skipped.
    unless ($file =~ /^db_accn_(\d{4})$/) {
        warn "skipping $file\n";
        next FILE;
    }
    my $dbId = $1;
    $file = "$scrapeDir/$file";
    warn "doing $file\n";

    # Three-argument open on a lexical handle: the file name can never be
    # interpreted as a mode, and the handle's line counter ($.) starts at
    # zero for every file, so no manual reset of $. is required.
    open my $in, '<', $file
        or die "cant open $file : $! \n";

    ## Ready to parse. Watch out for logic.
    ## Used for 'pywiki' stuff...
    print "<StartPageHere>\n";

    my $bodyFlag;           # Are we in the body of the page?
    my $dbTitleFlag;        # Do we have the database title?
    my $dbTitle    = '';    # The database title (wiki markup).
    my $dbTitleFix = '';    # The database title without html.
    my $dbUrlFlag;          # Do we have the database URL?
    my $dbUrl;              # The (last) database URL.
    my $dbAuthorFlag;       # Do we have the author line?
    my $dbAuthor;           # The author line.
    my $dbAuthorAdrFlag;    # Do we have the author address?
    my $dbAuthorAdr;        # The author address.
    my $dbContactFlag;      # Do we have the database contact?
    my $dbContact;          # The (last) database contact.
    my $dbH3Flag;           # Are we inside an H3 section?
    my $dbH3;               # Title of the current H3 section.
    my $dbAbstractFlag;     # Do we have the abstract link?
    my $dbAbstract;         # The abstract URL.

    # Used for sanity checking (only ever reported in verbose output).
    my $divCounter = 0;

    # Text accumulated for the 'pywiki' sections printed after the body.
    my $dbAuthorX;
    my $dbUrlX;
    my $dbContactX;
    my $dbCatX;

    LINE:
    while (<$in>) {
        chomp;

        # Look for the body; everything before the marker is ignored.
        if (!$bodyFlag) {
            if (/^\<!-- start body --\>$/) {
                $bodyFlag = 1;
                warn "starting at $. '$_'\n"
                    if $verbose > 0;
            }
            next LINE;
        }

        # Keep track of div's (fragile but perhaps enforces 'sanity')
        if (/\<div /) { $divCounter++ }

        # Look for the dbTitle.  There is deliberately no "next" here, so
        # the title line also runs through the remaining checks.
        if (!$dbTitleFlag) {
            if (/^\<h1 class=\"summary\"\>(.*)\<\/h1\>$/) {
                $dbTitle     = $1;
                $dbTitleFlag = 1;
                warn join("\t", $divCounter, $dbTitle), "\n"
                    if $verbose > 1;

                $dbTitleFix = $dbTitle;
                # NOTE(review): only upper-case <I> becomes wiki italics in
                # $dbTitle; a lower-case <i> is left as HTML - confirm.
                $dbTitle    =~ s/\<I\>(.*)\<\/I\>/''$1''/;
                $dbTitleFix =~ s/\<i\>(.*)\<\/i\>/$1/;
                $dbTitleFix =~ s/\<I\>(.*)\<\/I\>/$1/;
                $dbTitleFix =~ s/\<sup\>(.*)\<\/sup\>/$1/;
                # The literal-character forms look like HTML-decoded
                # remnants of the &reg; / &eacute; entities (compare the
                # intact &apos; below), so both spellings are accepted.
                $dbTitleFix =~ s/(?:\&reg|\®)\;//;
                $dbTitleFix =~ s/(?:\&eacute|\é)\;/e/;
                $dbTitleFix =~ s/\&apos\;/'/g;

                # Numeric form of the id, i.e. without leading zeros.
                $dbId = sprintf("%d", $dbId);

                ## Used for 'pywiki' stuff...
                print "'''Template:NARDatabase:$dbTitleFix'''\n";
                print "<noinclude>\n"; # Text only displayed on the template page
                print "{{NARDatabase:Header template |db=$dbTitleFix |dbId=$dbId}}\n";
                print "</noinclude>\n";
                print "<includeonly>\n"; # Text only displayed on the page which includes the template
                print "'''$dbTitle''' is [http://www.oxfordjournals.org/nar/database/summary/$dbId NAR Database No. $dbId].\n";
            }
        }

        # Look for the dbUrl: MULTIPLE links may be joined by " or ".
        if (!$dbUrlFlag) {
            if (/^(?:\<a href=\".*?\"\>.*?\<\/a\>(?: or )?)+$/) {
                my @url = split / or /;
                for (@url) {
                    if (/\<a href=\"(.*?)\"\>(.*?)\<\/a\>/) {
                        # The link text is expected to equal its target.
                        die "WHAT1? : \"$_\"\n"
                            if $1 ne $2;
                        $dbUrl     = $1;
                        $dbUrlFlag = 1;
                        warn join("\t", $divCounter, $dbUrl), "\n"
                            if $verbose > 1;
                        ## Used for 'pywiki' stuff...
                        $dbUrlX .= "* \[$1 $dbTitle homepage\]\n";
                    }
                    else {
                        die "WHAT2? : \"$_\"\n";
                    }
                }
                next LINE;
            }
        }

        # Grab authors;
        if (!$dbAuthorFlag) {
            if (/^ \<strong\>(.*)\<\/strong\>$/) {
                $dbAuthor     = $1;
                $dbAuthorFlag = 1;
                warn join("\t", $divCounter, $dbAuthor), "\n"
                    if $verbose > 1;
                ## Used for 'pywiki' stuff...
                $dbAuthorX .= "$dbAuthor\n";
                next LINE;
            }
        }

        # Grab author addresses: the first non-markup line after the
        # author line.
        if ($dbAuthorFlag && !$dbAuthorAdrFlag) {
            if (!/^ \<h3 / &&
                !/^ \<div / &&
                !/^ \<span / &&
                !/^ \<\/div\>$/ &&
                /^ (.+)$/) {
                $dbAuthorAdr     = $1;
                $dbAuthorAdrFlag = 1;
                warn join("\t", $divCounter, $dbAuthorAdr), "\n"
                    if $verbose > 1;
                ## Used for 'pywiki' stuff...
                $dbAuthorX .= ":$dbAuthorAdr\n";
                next LINE;
            }
        }

        # Grab contact details: MULTIPLE addresses may be joined by " or ".
        if (!$dbContactFlag) {
            if (/^ \<span class=\"subhead\"\>Contact\<\/span\> (?:\<a href=\"MAILTO:.*?\"\>.*?\<\/a\>(?: or )?)+$/) {
                $dbContactFlag = 1;
                ## Missed the boat!  Once the contact line has been seen,
                ## stop looking for an author address.
                $dbAuthorAdrFlag = 1;
                my @contact = split / or /;
                for (@contact) {
                    if (/\<a href=\"MAILTO:(.*?)\"\>(.*?)\<\/a\>/) {
                        # The link text is expected to equal the address.
                        die "WHAT3? : \"$_\"\n"
                            if $1 ne $2;
                        $dbContact = $1;
                        warn join("\t", $divCounter, $dbContact), "\n"
                            if $verbose > 1;
                        ## Used for 'pywiki' stuff...
                        $dbContactX .= "* EMAIL: $1 \n";
                    }
                    else {
                        die "WHAT4? : \"$_\"\n";
                    }
                }
                next LINE;
            }
        }

        # Grab (standard?) 'h3' sections.
        if (!$dbH3Flag) {
            if (/^ \<h3 class=\"summary\"\>(.*)\<\/h3\>$/) {
                $dbH3     = $1;
                $dbH3Flag = 1;
                warn join("\t", $divCounter, $dbH3), "\n"
                    if $verbose > 1;
                ## Used for 'pywiki' stuff...
                print "\n==$dbH3==\n";
                next LINE;
            }
        }
        else {
            # Inside an H3 section: a closing div ends it; lines containing
            # an opening div are left to the checks further down.
            if (/^ \<\/div\>$/) {
                $dbH3Flag = 0;
            }
            elsif (!/ \<div/ && /^ (.*)$/) {
                my $content = $1;
                $dbH3{$dbH3} = $content;

                ## Used for 'pywiki' stuff...
                if ($dbH3 ne "References") {
                    # Ordinary section: translate inline html to wiki
                    # markup and link "(N)" citations to the references.
                    my $text = $content;
                    $text =~ s/\<I\>(.*?)\<\/I\>/''$1''/g;
                    $text =~ s/\<b\>(.*?)\<\/b\>/'''$1'''/g;
                    if ($text =~ /\<a href\=\"MAILTO\:(.*?)\"\>(.*?)\<\/a\>/) {
                        die "WHAT5? : \"$1\" - \"$2\"\n"
                            if $1 ne $2;
                    }
                    $text =~ s/\<a href\=\"MAILTO\:(.*?)\"\>(.*?)\<\/a\>/$1/g;
                    $text =~ s/\<a href\=\"(.*?)\"\>(.*?)\<\/a\>/[$1 $2]/g;
                    $text =~ s/\((\d{1,2})\)/([[#References|*$1]])/g;
                    print "\n$text\n";
                    next LINE;
                }

                # References section: either a list of <li> items or text
                # separated by <BR>, <br> or <p>.  Each entry becomes one
                # numbered wiki list item.
                if (my @ref = (/\<li\>(.*?)\<\/li\>/g)) {
                    for (@ref) {
                        s/^\s*\d{1,2}\.//;
                        s/\<I\>(.*?)\<\/I\>/''[[Journal:$1|$1]]'',/g;
                        s/\<b\>(.*?)\<\/b\>/'''($1)'''/g;
                        s/\((\d\d\d\d)\)/([[PubDate:$1|$1]])/g;
                        print "# $_\n";
                    }
                }
                else {
                    for (split /\<BR\>|\<br\>|\<p\>/) {
                        s/^\s*\d{1,2}\.//;
                        s/\<I\>(.*?)\<\/I\>/''[[Journal:$1|$1]]''/g;
                        s/\<b\>(.*?)\<\/b\>/'''($1)'''/g;
                        s/\((\d\d\d\d)\)/([[PubDate:$1|$1]])/g;
                        print "# $_\n";
                    }
                }
                next LINE;
            }
        }

        # Grab category data
        if (/^ Category\: \<a href\=\"\/nar\/database\/cat\/(\d+)\"\>(.*)\<\/a\>$/) {
            warn join("\t", $divCounter, $1, $2), "\n"
                if $verbose > 1;
            ## Used for 'pywiki' stuff...
            $dbCatX .= "\n[[Category:NARDatabase:$2|$dbTitleFix]]\n";
            next LINE;
        }
        if (/^ Subcategory\: \<a href\=\"\/nar\/database\/subcat\/(\d+)\/(\d+)\"\>(.*)\<\/a\>$/) {
            warn join("\t", $divCounter, $1, $2, $3), "\n"
                if $verbose > 1;
            ## Used for 'pywiki' stuff...
            $dbCatX .= "[[Category:NARDatabase:$3|$dbTitleFix]]\n";
            next LINE;
        }

        ## Grab abstract link...
        if (!$dbAbstractFlag) {
            # The parentheses around "in the NAR ... Issue" are a capture
            # group ($2), not literal parentheses in the matched text.
            if (/^\<div class\=\"bodytext\">Go to the \<a href\=\"(.*)\"\>abstract\<\/a\> (in the NAR \d\d\d\d Database Issue)\.$/) {
                $dbAbstract     = $1;
                $dbAbstractFlag = 1;
                warn join("\t", $divCounter, $dbAbstract), "\n"
                    if $verbose > 1;
                ## Used for 'pywiki' stuff...
                $dbUrlX .= "* \[$1 $dbTitle abstract\] $2\n";
                next LINE;
            }
        }

        # Are we done here?
        if (/\<!-- end body --\>/) {
            last LINE;
        }
    }

    close $in;

    # Without a title the template header above was never printed; say so
    # explicitly instead of relying on "uninitialized value" warnings.
    warn "no title found in $file\n"
        unless $dbTitleFlag;

    ## Used for 'pywiki' stuff...
    print "\n";
    print "===Authors===\n", ($dbAuthorX || "* Missing"), "\n\n";
    print "===External Links===\n", ($dbUrlX || "* Missing"), "\n";
    print "===Contact Email===\n", ($dbContactX || "* Missing"), "\n";
    print "\n";
    # NOTE(review): the fallback category is spelled "NarDatabase", unlike
    # "NARDatabase" everywhere else - confirm whether that is intended.
    print "[[Category:NARDatabase|$dbTitleFix]]\n",
        ($dbCatX || "[[Category:NarDatabase:Missing|$dbTitleFix]]"), "\n";
    print "</includeonly>\n";
    print "<EndPageHere>\n";
}

# All files processed; release the directory handle.
closedir DIR;
# Finally list, on STDERR, every distinct H3 heading title that was
# recorded in %dbH3 while parsing (hash order, i.e. unsorted).
warn "printing H3's\n";
warn "$_\n" for keys %dbH3;
