#!/bin/sh

# 
#  FASt Term Recognizer
# 
#   fastr/cmd/fastr-free-indexing-fr
#   Version 2.04/04-02-01
# 
#   Copyright (C) 2004  Christian Jacquemin, LIMSI-CNRS
#   BP 133, 91403 ORSAY, FRANCE 
#   tel +33 (0)1 69 85 80 22 / fax -- 80 88
#   http://www.limsi.fr/Individu/jacquemi/
# 
#   Script for free indexing (French)
# 

# Use Option -h to display a help screen.

# Load the site-wide fastr settings, then the optional per-user overrides.
# The variables FASTR, FASTRBIN and FASTRTMP used later in this script are
# never assigned here, so they are presumably defined by these files.
#
# "." is used rather than "source": the script runs under /bin/sh, and
# "source" is a bash extension that a strict POSIX sh rejects as an unknown
# command, which would leave the configuration unloaded.
if [ -r /etc/fastr.conf ]; then
  . /etc/fastr.conf
else
  # Warn and carry on so that the help screen (-h) still works on a machine
  # without the system configuration.
  echo "fastr-free-indexing-fr: cannot read /etc/fastr.conf" >&2
fi
if [ -f "$HOME/.fastrrc" ]; then
  . "$HOME/.fastrrc"
fi

# set your default language configuration file for fastr
# (assigned after the files above are read, so it overrides any FASTRCONF
# they may set; the -c option can still replace it later)
FASTRCONF=/etc/fastr.conf-fr

export FASTRCONF

############################################################
# The script requires perl (we used version 5.004_04), grep,
# fastr, TreeTagger and the parameter file.
# 
# Input files have to be in ISO-Latin-1; SGML codes (if any)
# should be surrounded by carriage returns.
#
# WHAT THIS SCRIPT DOES:
#
# 1. CORPUS TAGGING:
# The tagging converts French text into a one-word-per-line
# three-column format of TreeTagger
# and converts it into fastr's format.
#
# 2. TERM ACQUISITION:
# A simplistic termer extracts candidate terms, converts
# them into fastr's rules and compiles these rules.
#
# 3. FREE INDEXING
# indexes a corpus with the terms acquired from the corpus
# 
###########################################################################

# Command-line parsing.
#   -h        request the usage screen (HELP=1)
#   -c file   use "file" as the fastr configuration file (FASTRCONF)
# After the loop the options are shifted away, so "$@" holds only the
# remaining operands.
HELP=0

while getopts hc: myopts; do
  case "$myopts" in
    h) HELP=1 ;;
    c) FASTRCONF="$OPTARG" ;;
    # Unknown option or missing argument to -c: getopts has already printed
    # a diagnostic. Ask for the usage screen instead of silently running the
    # whole pipeline with the bad option ignored.
    *) HELP=1 ;;
  esac
done
shift $((OPTIND - 1))

# INPUT is all remaining operands joined by single spaces; with exactly one
# operand this is simply that operand.
INPUT="$*"

# Print the usage screen and stop when help was requested or when the number
# of remaining operands is not exactly one (the input corpus is mandatory).
if [ "$HELP" -gt 0 ] || [ $# -ne 1 ]; then
  # The here-document delimiter is unquoted on purpose: $FASTRCONF is expanded
  # so the screen shows the configuration file currently in effect.
  cat << EOM
SYNTAX:
  fastr-free-indexing-fr [-c 'fastr configuration file'] input-corpus
DESCRIPTION:
- corpus tagging
- term acquisition
- free indexing
- requires Helmut Schmid TreeTagger
- requires Perl (Tested with Perl version 5.004_04)
OPTIONS:
 -h        display this help screen
 -c file   configuration file (Default: $FASTRCONF)
EOM
  # Requested help is a success; a wrong number of operands is a usage error
  # and must not look like a successful run to the caller.
  if [ "$HELP" -gt 0 ]; then
    exit 0
  fi
  exit 1
fi

# 1. CORPUS TAGGING
# please download TreeTagger from
# http://www.ims.uni-stuttgart.de/Tools/DecisionTreeTagger.html
#
# Reads the corpus named by the first operand, writes the TreeTagger output
# to ${FASTRTMP}/corpus.tag and its fastr-format translation to
# ${FASTRTMP}/corpus.fas.

# Abort early if the installation variables are missing: with an empty value
# the paths below would silently become root-relative (e.g. /corpus.tag).
: "${FASTR:?FASTR is not set (check the fastr configuration)}"
: "${FASTRBIN:?FASTRBIN is not set (check the fastr configuration)}"
: "${FASTRTMP:?FASTRTMP is not set (check the fastr configuration)}"

# Without a readable corpus every later step would run on empty data.
if [ ! -r "$1" ]; then
  printf '%s\n' "fastr-free-indexing-fr: cannot read input corpus: $1" >&2
  exit 1
fi

printf '%s\n' "Corpus tagging..." >&2
# "$1" is quoted so corpus names containing blanks or glob characters work
tree-tagger-french "$1" > "${FASTRTMP}/corpus.tag"
# translation of tagged corpus into fastr format
"${FASTRBIN}/TreeTaggertoFastr.pl" "${FASTR}/lib/TAGS-TreeTagger-fr" \
    < "${FASTRTMP}/corpus.tag" \
    > "${FASTRTMP}/corpus.fas"

# 2. TERM ACQUISITION
# Derives three files in ${FASTRTMP} from the tagged corpus:
#   terms.trm  - numbered candidate terms
#   terms.R.w  - single word rules
#   terms.R.t  - term rules
printf '%s\n' "Term acquisition and compiling..." >&2
# a simplistic termer
# Candidate terms are de-duplicated, then each line is prefixed with a
# six-digit zero-padded counter (starting at 000000) and a tab.
"${FASTRBIN}/TermerforFastr.pl" -fr < "${FASTRTMP}/corpus.tag" \
    | sort -u \
    | perl -ne 'printf "%06d\t%s" , $i++ , $_;' > "${FASTRTMP}/terms.trm"
# conversion of terms into single word rules
# (only the generated lines starting with "Word" are kept here)
"${FASTRBIN}/TermtoRules.pl" "${FASTR}/lib/TAGS-TreeTagger-fr" \
    < "${FASTRTMP}/terms.trm" \
    | grep '^Word' | sort -u \
    | "${FASTRBIN}/WordtoFamilies.pl" "${FASTR}/lib/der-families-fr" -links "${FASTR}/lib/sem-links-fr" \
    | sort -u > "${FASTRTMP}/terms.R.w"
# conversion of terms into term rules
# (the complement: every generated line that does not start with "Word")
"${FASTRBIN}/TermtoRules.pl" "${FASTR}/lib/TAGS-TreeTagger-fr" \
    < "${FASTRTMP}/terms.trm" \
    | grep -v '^Word' > "${FASTRTMP}/terms.R.t"

# Compile the word and term rules with fastr, using the configuration file
# selected by FASTRCONF. Every expansion is quoted: FASTRCONF can come from
# the -c option and may contain blanks.
printf '%s\n' "Rule compiling..." >&2
# resets current dictionary
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -z
# compiles word and term rules
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -c "${FASTRTMP}/terms.R.w"
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -c "${FASTRTMP}/terms.R.t"

# 3. CORPUS INDEXING:
# indexes a tagged corpus
# and replaces term identifiers by term strings
# The result of the second fastr pass goes to standard output.
printf '%s\n' "Indexing..." >&2
# The corpus is fed on standard input; the first pass (-i) is piped into the
# second pass (-s), which is given the numbered term list.
# NOTE(review): the meaning of the trailing "2" argument is not visible in
# this script - confirm against the fastr documentation.
"${FASTRBIN}/fastr" -C "${FASTRCONF}" -i < "${FASTRTMP}/corpus.fas" \
    | "${FASTRBIN}/fastr" -C "${FASTRCONF}" -s "${FASTRTMP}/terms.trm" 2
