Perl NAR Database summary text parser

From MetaBase

Jump to: navigation, search

Don't say I didn't warn you!

#!/usr/bin/perl -w

use strict;

# Directory holding the scraped summary pages; the main loop only processes
# entries named db_accn_NNNN (four digits).
my $scrapeDir = "Scrape";

# Diagnostic level on STDERR: > 0 reports where each page body starts,
# > 1 additionally echoes every extracted field.
my $verbose = 0;

# Section heading => last text line seen under that heading, accumulated
# across all files and reported at the end of the run.
my %dbH3;

# DIR is the directory handle the main loop reads entries from.
opendir(DIR, $scrapeDir)
  || die "cant open $scrapeDir : $! \n";

# Main loop: turn every scraped summary page in $scrapeDir into one wiki
# template page on STDOUT, wrapped in <StartPageHere> / <EndPageHere>.
#
# Per file, a line-oriented state machine runs over the HTML:
#   * everything before '<!-- start body -->' is ignored and parsing stops
#     at '<!-- end body -->';
#   * title, section headings and section text are printed as they are met;
#   * authors, links, contact e-mails and categories are buffered in the
#     $db...X strings and printed in the footer after the file is read.
#
# The script dies (WHAT1..WHAT5) when a link's visible text differs from
# its href, or when a link fragment cannot be parsed at all.
while (my $file = readdir(DIR)) {

  # Only entries named db_accn_NNNN are summary pages; the digits are the
  # database id.
  unless ($file =~ /^db_accn_(\d{4})$/) {
    warn "skipping $file\n";
    next;
  }

  my $dbId = $1;
  $file = "$scrapeDir/$file";

  warn "doing $file\n";

  # Three-argument open on a lexical handle.  Each new handle has its own
  # line counter, so $. starts from scratch for every file without having
  # to be reset by hand.
  open my $in, '<', $file
    or die "cant open $file : $! \n";

  ## Ready to parse. Watch out for logic.

  ## Used for 'pywiki' stuff...
  print "<StartPageHere>\n";

  # Parser state.  The *Flag variables record that a field has been seen.
  my $bodyFlag;          # Are we in the body of the page?
  my $dbTitleFlag;       # Do we have the database title?
  my $dbTitle    = '';   # The database title (wiki markup).
  my $dbTitleFix = '';   # The database title without html.
  my $dbUrlFlag;         # Do we have the database URL?
  my $dbUrl;             # The database URL (last one seen).
  my $dbAuthorFlag;      # Do we have the author line?
  my $dbAuthor;          # The author line.
  my $dbAuthorAdrFlag;   # Do we have the author address?
  my $dbAuthorAdr;       # The author address.
  my $dbContactFlag;     # Do we have the database contact?
  my $dbContact;         # The database contact (last one seen).
  my $dbH3Flag;          # Are we inside a H3 section?
  my $dbH3;              # Heading of the current H3 section.
  my $dbAbstractFlag;    # Do we have the abstract link?
  my $dbAbstract;        # The abstract URL.

  # Used for sanity checking;
  my $divCounter = 0;

  # Text buffered for the 'pywiki' footer.
  my $dbAuthorX;         # Authors and address.
  my $dbUrlX;            # External links (homepage(s) and abstract).
  my $dbContactX;        # Contact e-mail addresses.
  my $dbCatX;            # Category / subcategory links.

 LINE:
  while (<$in>) {
    chomp;

    # Look for the body...
    if (!$bodyFlag) {
      if (/^\<!-- start body --\>$/) {
        # Got it
        $bodyFlag = 1;
        warn "starting at $. '$_'\n"
          if $verbose > 0;
      }
      next LINE;
    }

    # Keep track of div's (fragile but perhaps enforces 'sanity')
    if (/\<div /) { $divCounter++ }

    # Look for the dbTitle...
    if (!$dbTitleFlag) {
      if (/^\<h1 class=\"summary\"\>(.*)\<\/h1\>$/) {
        $dbTitle     = $1;
        $dbTitleFlag = 1;
        warn join("\t", $divCounter, $dbTitle), "\n"
          if $verbose > 1;

        $dbTitleFix = $dbTitle;

        # Wiki version of the title: italics become ''...''.  Both tag
        # cases are handled, matching what is stripped from $dbTitleFix.
        $dbTitle =~ s/\<i\>(.*)\<\/i\>/''$1''/;
        $dbTitle =~ s/\<I\>(.*)\<\/I\>/''$1''/;

        # Plain version of the title: drop markup and entities.
        $dbTitleFix =~ s/\<i\>(.*)\<\/i\>/$1/;
        $dbTitleFix =~ s/\<I\>(.*)\<\/I\>/$1/;
        $dbTitleFix =~ s/\<sup\>(.*)\<\/sup\>/$1/;
        $dbTitleFix =~ s/\&reg\;//g;
        $dbTitleFix =~ s/\&eacute\;/e/g;
        $dbTitleFix =~ s/\&apos\;/'/g;

        # Numeric form of the id (drops leading zeros).
        $dbId = sprintf("%d", $dbId);

        ## Used for 'pywiki' stuff...

        print "'''Template:NARDatabase:$dbTitleFix'''\n";
        print "<noinclude>\n"; # Text only displayed on the template page
        print "{{NARDatabase:Header template |db=$dbTitleFix |dbId=$dbId}}\n";
        print "</noinclude>\n";
        print "<includeonly>\n"; # Text only displayed on the page which includes the template

        print "'''$dbTitle''' is [http://www.oxfordjournals.org/nar/database/summary/$dbId NAR Database No. $dbId].\n";
      }
    }

    # Look for the dbUrl: MULTIPLE! (several links joined by ' or ')
    if (!$dbUrlFlag) {
      if (/^(?:\<a href=\".*?\"\>.*?\<\/a\>(?: or )?)+$/) {

        my @url = split / or /;

        for (@url) {
          if (/\<a href=\"(.*?)\"\>(.*?)\<\/a\>/) {
            # The visible link text is expected to be the URL itself.
            die "WHAT1? : \"$_\"\n"
              if $1 ne $2;
            $dbUrl     = $1;
            $dbUrlFlag = 1;
            warn join("\t", $divCounter, $dbUrl), "\n"
              if $verbose > 1;

            ## Used for 'pywiki' stuff...
            $dbUrlX .= "* \[$dbUrl $dbTitle homepage\]\n";
          }
          else {
            die "WHAT2? : \"$_\"\n";
          }
        }
        next LINE;
      }
    }

    # Grab authors;
    if (!$dbAuthorFlag) {
      if (/^  \<strong\>(.*)\<\/strong\>$/) {
        $dbAuthor     = $1;
        $dbAuthorFlag = 1;
        warn join("\t", $divCounter, $dbAuthor), "\n"
          if $verbose > 1;

        ## Used for 'pywiki' stuff...
        $dbAuthorX .= "$dbAuthor\n";

        next LINE;
      }
    }

    # Grab author addresses: the first indented, non-empty line after the
    # authors that is not a h3/div/span opener or a closing div.
    if ($dbAuthorFlag && !$dbAuthorAdrFlag) {
      if (!/^  \<h3 / &&
          !/^  \<div / &&
          !/^  \<span / &&
          !/^  \<\/div\>$/ &&
          /^  (.+)$/) {
        $dbAuthorAdr     = $1;
        $dbAuthorAdrFlag = 1;
        warn join("\t", $divCounter, $dbAuthorAdr), "\n"
          if $verbose > 1;

        ## Used for 'pywiki' stuff...
        $dbAuthorX .= ":$dbAuthorAdr\n";

        next LINE;
      }
    }

    # Grab contact details: MULTIPLE! (several links joined by ' or ')
    if (!$dbContactFlag) {
      if (/^  \<span class=\"subhead\"\>Contact\<\/span\> (?:\<a href=\"MAILTO:.*?\"\>.*?\<\/a\>(?: or )?)+$/) {
        $dbContactFlag = 1;

        ## Missed the boat!  Once the contact line is reached, stop
        ## looking for an author address.
        $dbAuthorAdrFlag = 1;

        my @contact = split / or /;

        for (@contact) {
          if (/\<a href=\"MAILTO:(.*?)\"\>(.*?)\<\/a\>/) {
            # The visible link text is expected to be the address itself.
            die "WHAT3? : \"$_\"\n"
              if $1 ne $2;
            $dbContact = $1;
            warn join("\t", $divCounter, $dbContact), "\n"
              if $verbose > 1;

            ## Used for 'pywiki' stuff...
            $dbContactX .= "* EMAIL: $dbContact \n";
          }
          else {
            die "WHAT4? : \"$_\"\n";
          }
        }
        next LINE;
      }
    }

    # Grab (standard?) 'h3' sections.
    if (!$dbH3Flag) {
      if (/^  \<h3 class=\"summary\"\>(.*)\<\/h3\>$/) {
        $dbH3     = $1;
        $dbH3Flag = 1;
        warn join("\t", $divCounter, $dbH3), "\n"
          if $verbose > 1;

        ## Used for 'pywiki' stuff...
        print "\n==$dbH3==\n";

        next LINE;
      }
    }
    else {
      # Inside a section: a closing div ends it, any other indented line
      # that is not a div opener is section content.
      if (/^  \<\/div\>$/) {
        $dbH3Flag = 0;
      }
      elsif (!/  \<div/ && /^  (.*)$/) {
        my $content = $1;
        $dbH3{$dbH3} = $content;

        ## Used for 'pywiki' stuff...
        if ($dbH3 ne "References") {
          # Ordinary section text: convert inline html to wiki markup.
          my $text = $content;
          $text =~ s/\<I\>(.*?)\<\/I\>/''$1''/g;
          $text =~ s/\<b\>(.*?)\<\/b\>/'''$1'''/g;
          if ($text =~ /\<a href\=\"MAILTO\:(.*?)\"\>(.*?)\<\/a\>/) {
            die "WHAT5? : \"$1\" - \"$2\"\n"
              if $1 ne $2;
          }
          $text =~ s/\<a href\=\"MAILTO\:(.*?)\"\>(.*?)\<\/a\>/$1/g;
          $text =~ s/\<a href\=\"(.*?)\"\>(.*?)\<\/a\>/[$1 $2]/g;
          # '(12)' style citations become links to the References section.
          $text =~ s/\((\d{1,2})\)/([[#References|*$1]])/g;
          print "\n$text\n";
        }
        elsif (my @ref = (/\<li\>(.*?)\<\/li\>/g)) {
          # References given as <li> items.
          # NOTE(review): this branch emits a comma after the journal
          # name, the <BR>/<p> branch below does not -- confirm which
          # form is wanted.
          for (@ref) {
            s/^\s*\d{1,2}\.//;
            s/\<I\>(.*?)\<\/I\>/''[[Journal:$1|$1]]'',/g;
            s/\<b\>(.*?)\<\/b\>/'''($1)'''/g;
            s/\((\d\d\d\d)\)/([[PubDate:$1|$1]])/g;
            print "# $_\n";
          }
        }
        else {
          # References separated by <BR>, <br> or <p>.
          for (split /\<BR\>|\<br\>|\<p\>/) {
            s/^\s*\d{1,2}\.//;
            s/\<I\>(.*?)\<\/I\>/''[[Journal:$1|$1]]''/g;
            s/\<b\>(.*?)\<\/b\>/'''($1)'''/g;
            s/\((\d\d\d\d)\)/([[PubDate:$1|$1]])/g;
            print "# $_\n";
          }
        }

        # Section content is fully handled; nothing else applies to it.
        next LINE;
      }
    }

    # Grab category data
    if (/^   Category\: \<a href\=\"\/nar\/database\/cat\/(\d+)\"\>(.*)\<\/a\>$/) {
      my $catName = $2;
      warn join("\t", $divCounter, $1, $catName), "\n"
        if $verbose > 1;

      ## Used for 'pywiki' stuff...
      $dbCatX .= "\n[[Category:NARDatabase:$catName|$dbTitleFix]]\n";

      next LINE;
    }
    if (/^      Subcategory\: \<a href\=\"\/nar\/database\/subcat\/(\d+)\/(\d+)\"\>(.*)\<\/a\>$/) {
      my $subcatName = $3;
      warn join("\t", $divCounter, $1, $2, $subcatName), "\n"
        if $verbose > 1;

      ## Used for 'pywiki' stuff...
      $dbCatX .= "[[Category:NARDatabase:$subcatName|$dbTitleFix]]\n";

      next LINE;
    }

    ## Grab abstract link...
    if (!$dbAbstractFlag) {
      if (/^\<div class\=\"bodytext\">Go to the \<a href\=\"(.*)\"\>abstract\<\/a\> (in the NAR \d\d\d\d Database Issue)\.$/) {
        $dbAbstract = $1;
        my $issueNote = $2;   # 'in the NAR YYYY Database Issue'
        $dbAbstractFlag = 1;
        warn join("\t", $divCounter, $dbAbstract), "\n"
          if $verbose > 1;

        ## Used for 'pywiki' stuff...
        $dbUrlX .= "* \[$dbAbstract $dbTitle abstract\] $issueNote\n";

        next LINE;
      }
    }

    # Are we done here?
    if (/\<!-- end body --\>/) {
      last LINE;
    }
  }

  close $in
    or warn "cant close $file : $! \n";

  # Without a title the page name and category sort key below are empty;
  # say so instead of silently emitting a nameless page.
  warn "no title found in $file\n"
    unless $dbTitleFlag;

  ## Used for 'pywiki' stuff...
  # NOTE(review): the fallback category below is spelled 'NarDatabase',
  # every other category uses 'NARDatabase' -- confirm this is intended.
  print "\n";
  print "===Authors===\n",      ($dbAuthorX  || "* Missing"), "\n\n";
  print "===External Links===\n", ($dbUrlX     || "* Missing"), "\n";
  print "===Contact Email===\n",  ($dbContactX || "* Missing"), "\n";
  print "\n";
  print "[[Category:NARDatabase|$dbTitleFix]]\n",
    ($dbCatX || "[[Category:NarDatabase:Missing|$dbTitleFix]]"), "\n";
  print "</includeonly>\n";
  print "<EndPageHere>\n";

  #exit;
}

# All entries have been read; release the directory handle.
closedir DIR;


# Report on STDERR every distinct section heading collected in %dbH3 while
# parsing.  The keys are sorted so the report comes out in the same order on
# every run (plain hash order is not reproducible).
warn "printing H3's\n";

for my $heading (sort keys %dbH3) {
  warn "$heading\n";
}

Personal tools