Patch for namen.sh: replaces the fixed interactive loop with option parsing
(-i, -n, -p, -s), a usage() message and a search() helper.

NOTE(review): this patch was recovered from a copy whose newlines and runs of
whitespace had been collapsed. Line boundaries were restored and agree with
the hunk header (9 old lines, 96 new lines). Leading indentation and any
column alignment inside lines are reconstructed, not original, so apply with
whitespace-insensitive matching (for example `patch -l`) and do not expect the
result to hash to the blob named on the index line.

diff --git a/namen.sh b/namen.sh
index 7cca9f7..51351d1 100755
--- a/namen.sh
+++ b/namen.sh
@@ -1,9 +1,96 @@
 #!/bin/bash
 
-cd /net/corpora/nlnieuws
+PROGNAME=$0
+usage() {
+    echo "
+gebruik:
 
-select i in `find . -name '*data.dz' | sort`
+    $PROGNAME [opties] -i
+    $PROGNAME [opties] file.data.dz ...
+
+    -i : kies interactief
+
+overige opties:
+
+    -n int : max aantal resultaten
+    -p : gebruik pager
+    -s : tel hits één keer per bericht
+    "
+    exit
+}
+
+SINGLE=0
+SELECT=0
+USEPAGER=0
+LIMIT=0
+while getopts 'sin:p' opt
 do
-    alto "$i" fp:'//node[@cat="mwu" and node[@pt="spec"] and not(@his="normal") and not(@his_1="decap")]' tt:%w | \
-        sort | uniq -c | sort -nr | head -n 40
+    case "$opt" in
+        i)
+            SELECT=1
+            ;;
+        n)
+            LIMIT="$OPTARG"
+            ;;
+        p)
+            USEPAGER=1
+            ;;
+        s)
+            SINGLE=1
+            ;;
+        *)
+            usage
+            ;;
+    esac
 done
+shift "$(($OPTIND -1))"
+
+TAIL=''
+if [ $LIMIT -gt 0 ]
+then
+    TAIL=" | head -n $LIMIT"
+fi
+if [ $USEPAGER = 1 ]
+then
+    TAIL="$TAIL | ${PAGER:-less}"
+fi
+
+EXPR='fp://node[@cat="mwu" and node[@pt="spec"] and not(@his="normal") and not(@his_1="decap")]'
+if [ $SINGLE = 1 ]
+then
+    SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\t.*//'"
+    TEMPLATE='tt:%w\t%I'
+else
+    SORT=sort
+    TEMPLATE='tt:%w'
+fi
+
+search () {
+    eval "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null $TAIL"
+}
+
+for i in "$@"
+do
+    case "$i" in
+        *.data.dz)
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+
+if [ $# == 0 -a $SELECT = 1 ]
+then
+    cd /net/corpora/nlnieuws
+    select i in `find . -name '*data.dz' | sort`
+    do
+        search $i
+    done
+elif [ $# -gt 0 -a $SELECT = 0 ]
+then
+    search "$@"
+else
+    usage
+fi
+