NAR Database scrape pipeline notes

From MetaBase

Jump to: navigation, search

## Importing the NAR Database Summary text into our MediaWiki

## PHASE 1)

# Phase 1 runs from inside Scrape/; Phase 2 later refers to
# Scrape/index.html from the parent directory.
cd Scrape/

## Step 1) Grab the database summaries from the NAR website using the
## 'database accession codes'. I should ask the NAR people if they
## have this data in a more convenient format! Not that it makes a
## difference in the long run.

# Request summary pages for accession codes 1..1200. The downloads are
# saved under their bare number (Step 2 renames them).
# Brace expansion replaces the former perl-generated sequence; it needs bash.
for accn in {1..1200}; do
  wget "http://www.oxfordjournals.org/nar/database/summary/${accn}"
done



## Step 2) Fix the file names (standardize);

# Prefix every download whose name starts with a digit 1-9 with "db_accn_".
for f in [1-9]*; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # handing a non-existent name to mv.
  [ -e "$f" ] || continue
  # Quote the names and end option parsing so unusual file names are safe.
  mv -- "$f" "db_accn_${f}"
done

# Zero-pad the part after "db_accn_" to four characters so the names sort
# in numeric order: 3-character suffixes gain "0", 2-character suffixes
# gain "00", and 1-character suffixes gain "000".
# NOTE(review): this is the `rename FROM TO FILE...` argument form
# (util-linux). The Perl-based rename shipped by some distributions takes
# an expression instead -- confirm which one is installed before running.
rename db_accn_ db_accn_0 db_accn_???
rename db_accn_ db_accn_00 db_accn_??
rename db_accn_ db_accn_000 db_accn_?



## Step 3) Grab the database category list;

# NOTE(review): Phase 2, Step 3 reads Scrape/index.html, which is presumably
# the name wget gives this directory-style URL -- confirm after downloading.
wget http://www.oxfordjournals.org/nar/database/cap/



## PHASE 1 IS COMPLETE!

# Back to the parent directory, where the Phase 2 scripts are run as ./...
cd ../



## PHASE 2) CLEANUP

## Step 1) RUN the 'summary text' parser;

# Capture the summary parser's stdout as the wiki import file
# (uploaded in Phase 3, Step 1).
./parse_scrape.plx > narDatabaseDataForWiki.dat



## Step 2) Fix an 'encoding' problem; 

# Re-save the file as UTF-8 without an interactive editing session: the two
# -c arguments are the same ex commands that were previously typed by hand
# (":set fileencoding=utf-8" followed by ":wq").
# NOTE(review): 'fileencoding' is a Vim option, so `vi` must resolve to Vim.
vi -c 'set fileencoding=utf-8' -c 'wq' narDatabaseDataForWiki.dat



## Step 3) Parse category data;

# Turn the category index saved in Scrape/ into wiki text
# (uploaded in Phase 3, Step 3).
./parse_scrape.cat.plx Scrape/index.html > narDatabaseCatDataForWiki.dat



## Step 4) Get a list of database names (for later);

perl -ne '
  # A line consisting solely of <StartPageHere> arms the flag; only the
  # line that directly follows it is reported.
  if ($_ =~ /^\<StartPageHere\>$/) {
    $armed = 1;
    next;
  }
  # Capture the line minus its first three and last three characters
  # (presumably the wiki markup wrapped around the title; confirm against
  # the parser output).
  /...(.*).../;
  print "$1\n" if $armed;
  $armed = 0;
' narDatabaseDataForWiki.dat > narDatabaseDataForWiki.list



## Step 5) Prepare the regular database pages;

# Generate the regular database pages from the name list
# (uploaded in Phase 3, Step 2).
./build_db_pages.plx narDatabaseDataForWiki.list > databaseDataForWiki.dat



## Step 6) Check the list for names containing unexpected characters (prints every line that fails the whitelist);

perl -ne '
  # Lines made up only of word characters, whitespace and : - + , . @ / \ ( ) ?
  # pass silently; every other line is echoed so it can be inspected.
  next if /^(\w|\s|:|\-|\+|\,|\.|\@|\/|\\|\(|\)|\?)+$/;
  print;
' narDatabaseDataForWiki.list



## Step 7) Make the 'namespace pages' (for old times' sake);

# Generate the namespace pages from the same name list
# (uploaded in Phase 3, Step 4).
./build_db_namespace_pages.plx narDatabaseDataForWiki.list > databaseDataForWiki.ns.dat





## PHASE 2 IS COMPLETE!



## PHASE 3) INTERFACE THE WIKI! 

# The upload commands below run from inside pywikipedia/ and refer to the
# Phase 2 output files through ../
cd pywikipedia/

## Step 0) Check 'user-config.py' and 'families/metabase_family.py'

## Test the above config by logging in (should only need to run once).

python login.py	



## Step 1) Upload the database summary text;

# Arguments for the summary-text upload. The -start/-end values are the
# marker lines that delimit each page in the .dat file.
# NOTE(review): see pagefromfile.py's own usage text for the remaining flags.
summary_upload_args=(
  -putthrottle:0
  -log:../narDatabaseDataForWiki.log
  -force
  -notitle
  '-start:<StartPageHere>'
  '-end:<EndPageHere>'
  -file:../narDatabaseDataForWiki.dat
)
python pagefromfile.py "${summary_upload_args[@]}"



## Step 2) Upload the regular database pages;

# Arguments for the regular database page upload.
# NOTE(review): the <pageStart>/<pageEnd> markers are presumably the ones
# build_db_pages.plx writes -- confirm against databaseDataForWiki.dat.
db_page_upload_args=(
  -putthrottle:0
  -log:../databaseDataForWiki.log
  -force
  -notitle
  '-start:<pageStart>'
  '-end:<pageEnd>'
  -file:../databaseDataForWiki.dat
)
python pagefromfile.py "${db_page_upload_args[@]}"



## Step 3)  Upload the database category data;

# Arguments for the category data upload.
# NOTE(review): the <CatPageStartHere>/<CatPageEndHere> markers are
# presumably the ones parse_scrape.cat.plx writes -- confirm against
# narDatabaseCatDataForWiki.dat.
category_upload_args=(
  -putthrottle:0
  -log:../narDatabaseCatDataForWiki.log
  -force
  -notitle
  '-start:<CatPageStartHere>'
  '-end:<CatPageEndHere>'
  -file:../narDatabaseCatDataForWiki.dat
)
python pagefromfile.py "${category_upload_args[@]}"



## Step 4) Upload the database namespace pages;

# Arguments for the namespace page upload.
# NOTE(review): the <pageStart>/<pageEnd> markers are presumably the ones
# build_db_namespace_pages.plx writes -- confirm against
# databaseDataForWiki.ns.dat.
namespace_upload_args=(
  -putthrottle:0
  -log:../databaseDataForWiki.ns.log
  -force
  -notitle
  '-start:<pageStart>'
  '-end:<pageEnd>'
  -file:../databaseDataForWiki.ns.dat
)
python pagefromfile.py "${namespace_upload_args[@]}"





## PHASE 3 IS COMPLETE!



## PHASE 4?


Marked as

Personal tools