## NAR Database scrape pipeline notes
## From MetaBase
## Importing the NAR Database summary text into our MediaWiki.
## PHASE 1)
cd Scrape/
## Step 1) Grab the database summaries from the NAR website using the
## 'database accession codes' (every integer from 1 to 1200 is tried).
## Each summary is requested from .../summary/<accession>; Step 2 expects
## the downloads to be saved under names that start with a digit.
## I should ask the NAR people if they have this data in a more convenient
## format! Not that it makes a difference in the long run.
## The sequence comes from bash brace expansion, so no perl subprocess is
## needed just to count.
for i in {1..1200}; do
  wget "http://www.oxfordjournals.org/nar/database/summary/${i}"
done
## Step 2) Fix the file names (standardize);
## Every file in the current directory whose name starts with a digit 1-9
## becomes db_accn_<name>, with <name> left-padded with zeros to four
## characters (7 -> db_accn_0007, 42 -> db_accn_0042, 1200 -> db_accn_1200).
## Done in plain bash so it does not depend on which 'rename' flavour
## (util-linux or Perl) happens to be installed.
standardize_scrape_names() {
  local name padded
  for name in [1-9]*; do
    # With no matching files the glob stays literal; skip that entry.
    [[ -e "$name" ]] || continue
    padded=$name
    while (( ${#padded} < 4 )); do
      padded="0${padded}"
    done
    mv -- "$name" "db_accn_${padded}"
  done
}
standardize_scrape_names
## Step 3) Fetch the database category listing.
## NOTE(review): wget normally stores a URL ending in '/' as index.html --
## presumably the Scrape/index.html that Phase 2 parses; confirm.
wget 'http://www.oxfordjournals.org/nar/database/cap/'
## PHASE 1 IS COMPLETE!
cd ..
## PHASE 2) CLEANUP
## Step 1) Run the 'summary text' parser. It is invoked without arguments
## and its standard output is captured in narDatabaseDataForWiki.dat.
./parse_scrape.plx > narDatabaseDataForWiki.dat
## Step 2) Fix an 'encoding' problem;
## NOTE: this is an interactive step. The first line opens the file in vi;
## the two lines starting with ':' are then typed at the editor's command
## prompt (they are editor commands, not shell commands). They set the
## 'fileencoding' option to utf-8, then write the file and quit.
## NOTE(review): 'fileencoding' looks like a Vim option -- confirm that the
## installed vi is Vim.
vi narDatabaseDataForWiki.dat
:set fileencoding=utf-8
:wq
## Step 3) Parse the category data: feed Scrape/index.html to the category
## parser and capture its output in narDatabaseCatDataForWiki.dat.
./parse_scrape.cat.plx Scrape/index.html > narDatabaseCatDataForWiki.dat
## Step 4) Build a list of database names (needed by the later steps).
## Only the single line directly after each <StartPageHere> marker is
## used: it is printed with its first three and last three characters
## removed (this assumes the line is at least six characters long).
perl -ne '
  if (/^\<StartPageHere\>$/) { $take = 1; next }
  /...(.*).../;
  print "$1\n" if $take;
  $take = 0;
' narDatabaseDataForWiki.dat > narDatabaseDataForWiki.list
## Step 5) Prepare the regular database pages from the name list; the
## generator's output is written to databaseDataForWiki.dat.
./build_db_pages.plx narDatabaseDataForWiki.list > databaseDataForWiki.dat
## Step 6) Check the list for crap: print every line that contains a
## character outside the expected set (word characters, whitespace and
## : - + , . @ / \ ( ) ?). No output means the list is clean.
## A single character class replaces the original alternation of
## one-character branches; it accepts exactly the same lines.
perl -ne '
  print unless /^[\w\s:\-+,.\@\/\\()?]+$/;
' narDatabaseDataForWiki.list
## Step 7) Make the 'namespace pages' (for old times' sake) from the name
## list; the generator's output is written to databaseDataForWiki.ns.dat.
./build_db_namespace_pages.plx narDatabaseDataForWiki.list > databaseDataForWiki.ns.dat
## PHASE 2 IS COMPLETE!
## PHASE 3) INTERFACE WITH THE WIKI!
cd pywikipedia
## Step 0) Review 'user-config.py' and 'families/metabase_family.py', then
## test that configuration by logging in (should only need to run once).
python login.py
## Steps 1-4) Upload the generated page files to the wiki.
## All four uploads run pagefromfile.py with the same options and differ
## only in the data file and in the page start/end markers, so they share
## one helper instead of four copy-pasted command lines.
##
## upload_wiki_pages BASENAME START_MARKER END_MARKER
##   Runs pagefromfile.py on ../BASENAME.dat, logging to ../BASENAME.log,
##   with START_MARKER / END_MARKER passed as the -start: / -end: values.
##   Returns pagefromfile.py's exit status, or 2 when not given exactly
##   three arguments.
upload_wiki_pages() {
  if (( $# != 3 )); then
    printf 'usage: upload_wiki_pages BASENAME START_MARKER END_MARKER\n' >&2
    return 2
  fi
  local base=$1 start=$2 end=$3
  python pagefromfile.py \
    -putthrottle:0 \
    "-log:../${base}.log" \
    -force \
    -notitle \
    "-start:${start}" \
    "-end:${end}" \
    "-file:../${base}.dat"
}
## Step 1) Upload the database summary text;
upload_wiki_pages narDatabaseDataForWiki '<StartPageHere>' '<EndPageHere>'
## Step 2) Upload the regular database pages;
upload_wiki_pages databaseDataForWiki '<pageStart>' '<pageEnd>'
## Step 3) Upload the database category data;
upload_wiki_pages narDatabaseCatDataForWiki '<CatPageStartHere>' '<CatPageEndHere>'
## Step 4) Upload the database namespace pages;
upload_wiki_pages databaseDataForWiki.ns '<pageStart>' '<pageEnd>'
## PHASE 3 IS COMPLETE!
## PHASE 4?
Marked as
