#!/bin/sh

# 
#  FASt Term Recognizer
# 
#   fastr/cmd/fastr-controlled-indexing-en
#   Version 2.04/04-02-01
# 
#   Copyright (C) 2004  Christian Jacquemin, LIMSI-CNRS
#   BP 133, 91403 ORSAY, FRANCE 
#   tel +33 (0)1 69 85 80 22 / fax -- 80 88
#   http://www.limsi.fr/Individu/jacquemi/
# 
#   Script for controlled indexing (English)
# 

# Use Option -h to display a help screen.

# Load the system-wide fastr settings, then the optional per-user overrides.
# `.` is used instead of `source`: this script runs under /bin/sh, and
# `source` is a bash extension that POSIX shells such as dash do not provide.
# NOTE(review): the rest of this script reads FASTR, FASTRBIN, FASTRTMP and
# FASTREMPTY without assigning them; presumably they are set by these
# files -- confirm against /etc/fastr.conf.
. /etc/fastr.conf
if [ -f "$HOME/.fastrrc" ]; then
    . "$HOME/.fastrrc"
fi

# set your default language configuration file for fastr
# (assigned after the files above are read, so it replaces any FASTRCONF
# value they set; the -c option handled further down can still override it)
FASTRCONF=/etc/fastr.conf-en

export FASTRCONF

############################################################
# The script requires perl (we used version 5.004_04), grep,
# fastr, TreeTagger and the parameter file.
# 
# Input Files have to be in ISO-Latin-1, SGML-Codes (if any)
# should be surrounded by carriage returns.
#
# WHAT THIS SCRIPT DOES:
#
# 1. CORPUS TAGGING:
# The tagging converts English text into a one-word-per-line
# three-column format of TreeTagger (PennTreeBank tagset)
# and converts it into fastr's format.
#
# 2. TERM RECYCLING:
# The supplied term list is tagged with TreeTagger, converted
# into fastr's word and term rules, and these rules are compiled.
#
# 3. CONTROLLED INDEXING
# indexes a corpus with the recycled terms
# 
###########################################################################

# Command-line parsing.
#   -h        request the help screen (HELP=1)
#   -c file   use 'file' as fastr configuration file instead of the default
# Anything getopts rejects (unknown option, -c without its argument) also
# requests the help screen instead of being silently skipped.
HELP=0

while getopts hc: myopts
do
    case "$myopts" in
        h) HELP=1 ;;
        c) FASTRCONF="$OPTARG" ;;
        *) HELP=1 ;;
    esac
done
# drop the parsed options so that $1, $2, ... are the remaining operands
shift $((OPTIND - 1))

# All remaining operands joined into one string (separated by the first
# character of IFS, a space by default).  "$*" is identical to $1 when
# there is exactly one operand, so no special case is needed.
# NOTE(review): INPUT is not read again in the part of the script visible
# here; it is kept in case something else relies on it.
INPUT="$*"

# Print the help screen on standard output.
# The "Default:" line shows the current value of FASTRCONF.
usage() {
cat << EOM
SYNTAX:
  fastr-controlled-indexing-en [-h] [-c 'fastr configuration file'] input-corpus terms
DESCRIPTION:
- corpus tagging
- term acquisition
- controlled indexing
- requires Helmut Schmid TreeTagger
- requires Perl (Tested with perl version 5.004_04)
OPTIONS:
 -h        display this help screen
 -c file   configuration file (Default: $FASTRCONF)
EOM
}

# Help was requested: print it and finish successfully.
if [ "${HELP:-0}" -gt 0 ]; then
    usage
    exit 0
fi

# Exactly two operands are required (input corpus and term list).  Any
# other count is a usage error: report it on stderr and exit non-zero so
# that callers can detect the failure.
if [ $# -ne 2 ]; then
    usage >&2
    exit 1
fi

# 1. CORPUS TAGGING
# please download TreeTagger from
# http://www.ims.uni-stuttgart.de/Tools/DecisionTreeTagger.html
printf 'Corpus tagging...\n' >&2
# $1 is the input corpus.  It and every path below are quoted so that a
# name containing blanks or glob characters is passed as a single argument.
tree-tagger-english "$1" > "${FASTRTMP}/corpus.tag"
# translation of tagged corpus into fastr format
# (the tagged corpus is fed on stdin; the tag table is the only argument)
"${FASTRBIN}/TreeTaggertoFastr.pl" "${FASTR}/lib/TAGS-TreeTagger-en" \
    < "${FASTRTMP}/corpus.tag" \
    > "${FASTRTMP}/corpus.fas"

# 2. TERM RECYCLING
printf 'Term recycling and compiling...\n' >&2
# conversion of terms into standard format
# ($2 is the term list; the perl filter prefixes every line with a
# six-digit zero-padded running number, starting at 000000, and a tab)
tree-tagger-english "$2" > "${FASTRTMP}/terms.tag"
"${FASTRBIN}/TreeTaggertoTerms.pl" < "${FASTRTMP}/terms.tag" \
    | perl -ne 'printf "%06d\t%s" , $i++ , $_;' > "${FASTRTMP}/terms.trm"
# conversion of terms into single word rules
# (only the lines starting with "Word" are kept, de-duplicated, passed
# through WordtoFamilies.pl and de-duplicated again)
"${FASTRBIN}/TermtoRules.pl" "${FASTR}/lib/TAGS-TreeTagger-en" \
    < "${FASTRTMP}/terms.trm" \
    | grep '^Word' | sort -u \
    | "${FASTRBIN}/WordtoFamilies.pl" "${FASTR}/lib/der-families-en" \
        -classes "${FASTR}/lib/sem-classes-en" \
    | sort -u > "${FASTRTMP}/terms.R.w"
#addition of verb 'to be' that appears in metarules
# (grep -q tests for the rule directly, so no scratch file has to be
# created and removed)
if ! grep -q "Word 'be' : <cat> = V" "${FASTRTMP}/terms.R.w"
then echo "Word 'be' : <cat> = V." >> "${FASTRTMP}/terms.R.w"
fi
# conversion of terms into term rules
# (same converter as above, this time keeping everything that is NOT a
# "Word" line)
"${FASTRBIN}/TermtoRules.pl" "${FASTR}/lib/TAGS-TreeTagger-en" \
    < "${FASTRTMP}/terms.trm" \
    | grep -v '^Word' > "${FASTRTMP}/terms.R.t"

printf 'Rule compiling...\n' >&2
# resets current dictionary
"${FASTRBIN}/fastr" -C "${FASTREMPTY}" -z
# compiles word and term rules
# (word rules are compiled with the FASTREMPTY configuration, term rules
# with FASTRCONF; expansions are quoted so each path stays one argument)
"${FASTRBIN}/fastr" -C "${FASTREMPTY}" -c "${FASTRTMP}/terms.R.w"
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -c "${FASTRTMP}/terms.R.t"

# 3. CORPUS INDEXING:
# indexes a tagged corpus
# and replaces term identifiers by term strings
printf 'Indexing...\n' >&2
# The first fastr run (-i) reads the fastr-format corpus on stdin; its
# output is piped into a second run (-s) that is given the numbered term
# file.  The result goes to this script's standard output.
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -i < "${FASTRTMP}/corpus.fas" \
    | "${FASTRBIN}/fastr" -C "${FASTRCONF}" -s "${FASTRTMP}/terms.trm" 2
