#!/bin/sh

# Discard any leading "-path" flags so that the next positional argument is
# the master archive path. Any other dash-prefixed word is rejected.
while [ $# -gt 0 ] && [ "$1" = "-path" ]
do
  shift
done

case "${1-}" in
  -* )
    echo "$0: Unrecognized option $1" >&2
    exit 1
    ;;
esac

# Determine MASTER, the master archive area. The next command-line argument
# takes precedence; otherwise the EDIRECT_PUBMED_MASTER environment variable
# is used, and the script stops if neither is available.
if [ "$#" -gt 0 ]
then
  target="$1"
  # Canonicalize to an absolute path. Stop if the directory cannot be
  # entered: continuing with an empty MASTER would make later paths such as
  # "$MASTER/Archive" resolve directly under the filesystem root.
  MASTER=$(cd "$target" && pwd) || {
    echo "$0: Unable to access master archive path $target" >&2
    exit 1
  }
  CONFIG=${MASTER}
  shift
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    # Diagnostics go to stderr, matching the option-parsing errors.
    echo "Must supply path to master archive area or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    # Drop a single trailing slash so "$MASTER/sub" never contains "//".
    MASTER=${MASTER%/}
  fi
fi

# Skip any "-temp", "-work" or "-working" flags that precede the working
# directory argument. Any other dash-prefixed word is rejected.
while [ $# -gt 0 ]
do
  case "$1" in
    -temp | -work | -working ) shift ;;
    * ) break ;;
  esac
done

if [ $# -gt 0 ]
then
  case "$1" in
    -* )
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
  esac
fi

# Determine WORKING, the scratch area. Order of precedence: the next
# command-line argument, then the EDIRECT_PUBMED_WORKING environment
# variable, and finally the same location as MASTER.
if [ "$#" -gt 0 ]
then
  working="$1"
  # Canonicalize to an absolute path. Stop if the directory cannot be
  # entered rather than continuing with an empty WORKING.
  WORKING=$(cd "$working" && pwd) || {
    echo "$0: Unable to access working path $working" >&2
    exit 1
  }
  shift
elif [ -z "${EDIRECT_PUBMED_WORKING}" ]
then
  WORKING=${MASTER}
else
  WORKING="${EDIRECT_PUBMED_WORKING}"
  # Drop a single trailing slash so "$WORKING/sub" never contains "//".
  WORKING=${WORKING%/}
fi

# Report the resolved locations and create the directory layout used by the
# later stages. An empty path aborts the script here, so that directories
# are never created directly under "/".
: "${MASTER:?master archive path is empty}"
: "${WORKING:?working path is empty}"

echo "MASTER $MASTER"

echo "WORKING $WORKING"

for dir in Archive Postings
do
  # The later stages depend on these directories, so stop on failure.
  mkdir -p "$MASTER/$dir" || exit 1
done

for dir in Current Data Indexed Inverted Merged Pubmed
do
  mkdir -p "$WORKING/$dir" || exit 1
done

date

# Download stage: fetch the reference data into $WORKING/Data and record the
# elapsed time in DWN.
seconds_start=$(date "+%s")
# Everything below writes into the current directory, so do not continue if
# the data directory cannot be entered.
cd "$WORKING/Data" || exit 1
echo "Downloading BioC Tables"
download-ncbi-data bioconcepts
echo "Downloading GeneRIFs"
download-ncbi-data generifs
echo "Downloading MeSH Tree"
download-ncbi-data meshtree
echo "Downloading Theme Data"

# Fetch and decompress one theme file unless it is already present.
#   $1  progress label printed before a download starts
#   $2  name of the uncompressed file in the current directory
# Returns non-zero if the download fails; the caller carries on with the
# remaining files.
fetch_theme_file() {
  if [ ! -f "$2" ]
  then
    if [ ! -f "$2.gz" ]
    then
      echo "$1"
      # -f makes curl fail on an HTTP error instead of saving the error page
      # as if it were the archive; -L follows redirects.
      if ! curl -fsL "https://zenodo.org/record/3459420/files/$2.gz?download=1" > "$2.gz"
      then
        echo "$0: Unable to download $2.gz" >&2
        # Remove the partial file so that a later run retries the download.
        rm -f "$2.gz"
        return 1
      fi
    fi
    gunzip -q "$2.gz"
  fi
}

for bs in \
  "chemical-disease" \
  "chemical-gene" \
  "gene-disease" \
  "gene-gene"
do
  one="part-i-$bs-path-theme-distributions.txt"
  fetch_theme_file "part-i-$bs" "$one"
  two="part-ii-dependency-paths-$bs-sorted-with-themes.txt"
  fetch_theme_file "part-ii-$bs" "$two"
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
DWN=$seconds

# Purge stage: delete index files left over from a previous run and record
# the elapsed time in CLR. Each entry below pairs a directory under $WORKING
# with the file extension that is removed there (plain and gzipped).
seconds_start=$(date "+%s")
echo "Removing Previous Indices"
for spec in Indexed:e2x Inverted:inv Merged:mrg
do
  target="$WORKING/${spec%%:*}"
  ext=${spec#*:}
  cd "$target"
  find "$target" -name "*.$ext" -delete
  find "$target" -name "*.$ext.gz" -delete
done
seconds_end=$(date "+%s")
CLR=$((seconds_end - seconds_start))
seconds=$CLR
echo "$seconds seconds"

# Indexing stage (timed from here): start from an empty $WORKING/Indexed,
# then index the BioC tables and the GeneRIFs.
seconds_start=$(date "+%s")
target="$WORKING/Indexed"
cd "$target"
find "$target" -name "*.e2x" -delete
find "$target" -name "*.e2x.gz" -delete
echo "Indexing BioC"
# Each row below supplies: domain (selects the <domain>2pubtatorcentral.gz
# input), transform (selects the <transform>.txt table), and output prefix.
while read domain transform prefix
do
  gunzip -c "$WORKING/Data/${domain}2pubtatorcentral.gz" |
  rchive -bioconcepts "$WORKING/Data/$transform.txt" |
  rchive -gzip -thesis 5000000 "$target" "$prefix"
done <<EOF
chemical meshname biocchem
disease meshname biocdisz
gene genename biocgene
EOF
echo "Indexing GeneRIFs"
gunzip -c "$WORKING/Data/generifs_basic.gz" |
rchive -generif "$WORKING/Data/genename.txt" |
rchive -gzip -thesis 5000000 "$target" "generifs"
echo "Indexing Themes"

# Print the four-letter code for a relation name such as "chemical-disease":
# the first two letters of each hyphen-separated half, in upper case
# (for example "CHDI"). Written with awk so that it runs under any POSIX sh;
# substring expansion and $'...' quoting are not available there.
#   $1  relation name of the form "first-second"
theme_code() {
  printf '%s\n' "$1" |
  awk -F- '{ print toupper(substr($1, 1, 2) substr($2, 1, 2)) }'
}

# Run "rchive -themes" once per relation, passing both theme files and the
# relation code, and collect all of the output in a temporary table.
for bs in \
  "chemical-disease" \
  "chemical-gene" \
  "gene-disease" \
  "gene-gene"
do
  one="$WORKING/Data/part-i-$bs-path-theme-distributions.txt"
  two="$WORKING/Data/part-ii-dependency-paths-$bs-sorted-with-themes.txt"
  thr=$(theme_code "$bs")
  rchive -themes "$one" "$two" "$thr"
done > "$target/themes.tbl"

# Index the table lines containing each field name as a whole word, using
# the lower-case field name as the output prefix.
for fld in THME CONV
do
  tag=$(echo "$fld" | tr '[:upper:]' '[:lower:]')
  grep -w "$fld" "$target/themes.tbl" |
  rchive -gzip -thesis 5000000 "$target" "$tag"
done
rm "$target/themes.tbl"
echo "Indexing Paths"
# Pass each relation's part-ii file through its own "rchive -dpaths" run and
# feed the combined output to a single indexing pass with prefix "dpaths".
for rel in chemical-disease chemical-gene gene-disease gene-gene
do
  cat "$WORKING/Data/part-ii-dependency-paths-$rel-sorted-with-themes.txt" |
  rchive -dpaths
done |
rchive -gzip -thesis 5000000 "$target" "dpaths"
# Close the timing of the indexing stage.
seconds_end=$(date "+%s")
IDX=$((seconds_end - seconds_start))
seconds=$IDX
echo "$seconds seconds"

# Inversion stage: turn every *.e2x.gz file in $WORKING/Indexed into a
# matching *.inv.gz file in $WORKING/Inverted; elapsed time goes to INV.
seconds_start=$(date "+%s")
# The glob below is relative to the current directory, so stop rather than
# process whatever happens to be in some other directory.
cd "$WORKING/Indexed" || exit 1
echo "Inverting Extra Indices"
target="$WORKING/Inverted"
find "$target" -name "*.inv" -delete
find "$target" -name "*.inv.gz" -delete
for fl in *.e2x.gz
do
  # An unmatched glob is left as the literal pattern; skip it instead of
  # creating a file literally named "*.inv.gz".
  [ -f "$fl" ] || continue
  base=${fl%.e2x.gz}
  echo "$base.inv"
  gunzip -c "$fl" |
  rchive -invert |
  gzip -1 > "$target/$base.inv.gz"
  sleep 1
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
INV=$seconds

# Merge stage: hand all *.inv.gz files in $WORKING/Inverted to
# "rchive -gzip -merge", writing into $WORKING/Merged; elapsed time goes to
# MRG.
seconds_start=$(date "+%s")
# The *.inv.gz glob below is relative to the current directory, so stop if
# the directory cannot be entered.
cd "$WORKING/Inverted" || exit 1
echo "Merging Extra Indices"
target="$WORKING/Merged"
find "$target" -name "*.mrg" -delete
find "$target" -name "*.mrg.gz" -delete
# Normalize the kernel name so that Cygwin and MinGW both read "CYGWIN_NT".
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
then
  # Pass the target to rchive as a Windows-style path.
  target=$(cygpath -w "$target")
fi
target=${target%/}
rchive -gzip -merge "$target" *.inv.gz
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
MRG=$seconds

# Postings stage: for each field, pass the merged *.mrg.gz files to
# "rchive -promote" in batches of up to 100; elapsed time goes to PST.
seconds_start=$(date "+%s")
# find "." below is relative to the current directory, so stop if the
# directory cannot be entered.
cd "$WORKING/Merged" || exit 1
echo "Producing Extra Postings"
# NOTE(review): the Postings directory is created under $MASTER earlier in
# this script, while this stage targets $WORKING/Postings. The two only
# coincide when WORKING equals MASTER - confirm which location is intended.
target="$WORKING/Postings"
# Normalize the kernel name so that Cygwin and MinGW both read "CYGWIN_NT".
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
then
  # Pass the target to rchive as a Windows-style path.
  target=$(cygpath -w "$target")
fi
target=${target%/}
for fld in CHEM DISZ GENE PATH THME CONV
do
  echo "$fld"
  find "." -name "*.mrg.gz" |
  sort |
  xargs -n 100 echo |
  while read -r files
  do
    # $files is deliberately unquoted: each line holds a batch of file names
    # that must be split into separate arguments.
    rchive -promote "$target" "$fld" $files
  done
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
PST=$seconds

# Timing slot for the archive population step. The step itself is commented
# out, so POP only measures the directory change.
seconds_start=$(date "+%s")
cd "$WORKING/Indexed"
# echo "Populating Link Archive"
# rchive -distribute "$MASTER/Archive" *.e2x.gz
seconds_end=$(date "+%s")
POP=$((seconds_end - seconds_start))
seconds=$POP
# echo "$seconds seconds"

# Per-stage timing summary.
cat <<EOF
DWN $DWN seconds
CLR $CLR seconds
IDX $IDX seconds
INV $INV seconds
MRG $MRG seconds
PST $PST seconds
EOF
# echo "POP $POP seconds"

echo ""

date
