#!/bin/bash
#
# Run one of eight predefined queries with `alto` over corpus files and/or
# directories and print the hits as a frequency table, most frequent first.
#
# Usage:
#   PROG [options] -x 1..8 -i                    choose a corpus from a menu
#   PROG [options] -x 1..8 corpus-file-or-dir...
#
# External commands used: alto, tabulate, sed, sort, uniq, head, find and,
# with -p, the command in $PAGER (default: less).

PROGNAME=$0

# Print the help text on stdout and terminate with a failure status. It is
# only reached when the command line is incomplete or invalid.
usage() {
    cat <<EOF

gebruik:

    $PROGNAME [opties] -x 1..8 -i
    $PROGNAME [opties] -x 1..8 corpusfile(s) en/of corpusdirectory(s)

    -x : query
         1 : nieuwe namen
         2 : nieuwe woorden
         3 : nieuwe woorden met postag en lemma
         4 : bestaaande locaties
         5 : bestaande personen
         6 : bestaande organisaties
         7 : bestaande andere namen
         8 : nieuwe adjectieven, deelwoorden en werkwoorden
    -i : kies interactief

overige opties:

    -n int : max aantal resultaten
    -p     : gebruik pager
    -s     : tel hits één keer per bericht
    -v     : verbose

EOF
    exit 1
}

# ---------------------------------------------------------------------------
# Option parsing
# ---------------------------------------------------------------------------

SINGLE=0     # -s: count a hit only once per item (see SORT below)
SELECT=0     # -i: choose the corpus interactively
USEPAGER=0   # -p: pipe the result through a pager
LIMIT=0      # -n: maximum number of result lines, 0 = unlimited
VERBOSE=0    # -v: print the command line before running it
XN=''        # -x: number of the query to run
XVALID=0     # set to 1 once -x selected a known query
EXPR=''      # query expression handed to alto
TEMPLATE=''  # output template handed to alto

while getopts 'sin:pvx:' opt; do
    case "$opt" in
        i) SELECT=1 ;;
        n) LIMIT="$OPTARG" ;;
        p) USEPAGER=1 ;;
        s) SINGLE=1 ;;
        v) VERBOSE=1 ;;
        x) XN="$OPTARG" ;;
        # Unknown option or missing option argument; getopts has already
        # written its own diagnostic.
        *) usage ;;
    esac
done
shift "$((OPTIND - 1))"

# LIMIT is interpolated into the command line that search() evaluates, so it
# must be a plain non-negative integer.
case "$LIMIT" in
    '' | *[!0-9]*) usage ;;
esac

# ---------------------------------------------------------------------------
# Query table
# ---------------------------------------------------------------------------

# Expression shared by queries 2 and 3; they differ only in output template.
NEW_WORDS_EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'

# Select the "existing names" query for one @neclass value.
# Arguments: $1 - value compared against @neclass (LOC, PER, ORG or MISC)
# Globals:   EXPR, TEMPLATE, XVALID (written)
existing_names_query() {
    printf -v EXPR 'fp://node[(@neclass="%s" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="%s"] and @his="normal")]' "$1" "$1"
    TEMPLATE='tt:%l'
    XVALID=1
}

case "$XN" in
    1)  # new names
        EXPR='fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]'
        TEMPLATE='tt:%w'
        XVALID=1
        ;;
    2)  # new words
        EXPR=$NEW_WORDS_EXPR
        TEMPLATE='tt:%w'
        XVALID=1
        ;;
    3)  # new words with POS tag and lemma
        EXPR=$NEW_WORDS_EXPR
        TEMPLATE='tt:%w\t%l\t%P'
        XVALID=1
        ;;
    4)  # existing locations
        existing_names_query LOC
        ;;
    5)  # existing persons
        existing_names_query PER
        ;;
    6)  # existing organizations
        existing_names_query ORG
        ;;
    7)  # existing other names (books, films, events, ...)
        existing_names_query MISC
        ;;
    8)  # new adjectives, participles and verbs
        EXPR='fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]'
        TEMPLATE='tt:%w\t%P'
        XVALID=1
        ;;
    '')
        # No -x given: XVALID stays 0 and the dispatch below shows the usage.
        ;;
    *)
        echo 'Invalid value for option -x' >&2
        exit 1
        ;;
esac

# ---------------------------------------------------------------------------
# Pipeline fragments (text; assembled and evaluated in search())
# ---------------------------------------------------------------------------

# Optional trailing stages: result limit and pager.
TAIL=''
if [ "$LIMIT" -gt 0 ]; then
    TAIL=" | head -n $LIMIT"
fi
if [ "$USEPAGER" = 1 ]; then
    TAIL="$TAIL | ${PAGER:-less}"
fi

# Stage between alto and `uniq -c`.
if [ "$SINGLE" = 1 ]; then
    # An extra %I column is appended to every hit. The first sed drops a
    # trailing ".<digits>" from the line, `sort | uniq` collapses duplicate
    # lines, and the second sed blanks everything after the last tab so that
    # the later `uniq -c` counts each remaining line once.
    SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\(.*\)\t.*/\1\t/'"
    TEMPLATE="$TEMPLATE"'\t%I'
else
    SORT=sort
fi

# Run the selected query and print the frequency table.
# Arguments: corpus files and/or directories, passed on to alto
# Globals:   EXPR, TEMPLATE, SORT, TAIL, VERBOSE (read)
# Outputs:   the table on stdout (or in the pager); with -v the command line
#            is printed first
search() {
    local corpora=''
    local cmd

    # Shell-quote every path so that spaces or metacharacters in a file name
    # survive the eval below as data instead of being split or executed.
    if [ $# -gt 0 ]; then
        printf -v corpora '%q ' "$@"
    fi

    # NOTE(review): eval is kept on purpose: -v prints exactly the command
    # that is run, and $PAGER may contain arguments. Everything interpolated
    # here is a script constant, a validated integer, the user's own $PAGER,
    # or a %q-quoted path.
    cmd="alto ${corpora}'$EXPR' '$TEMPLATE' | $SORT | uniq -c | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 2> /dev/null | sort -n -r -k 1,1 -s 2> /dev/null | tabulate -s '\t' -f plain 2> /dev/null$TAIL"

    if [ "$VERBOSE" = 1 ]; then
        printf '%s\n' "$cmd"
    fi
    eval "$cmd"
}

# ---------------------------------------------------------------------------
# Argument validation and dispatch
# ---------------------------------------------------------------------------

# Every positional argument must carry a known corpus extension or be a
# directory.
for i in "$@"; do
    case "$i" in
        *.xml | *.dact | *.dbxml | *.data.dz | *.index | *.zip)
            ;;
        *)
            if [ ! -d "$i" ]; then
                usage
            fi
            ;;
    esac
done

if [[ $# -eq 0 && "$SELECT" = 1 && "$XVALID" = 1 ]]; then
    # Interactive mode: offer every *data.dz file below the corpus root.
    cd /net/corpora/nlnieuws || exit 1

    menu=()
    while IFS= read -r entry; do
        menu+=("$entry")
    done < <(find . -name '*data.dz' | sort)

    select i in "${menu[@]}"; do
        # An answer that is not on the menu leaves $i empty; ask again
        # instead of running alto without a corpus.
        [[ -n "$i" ]] || continue
        search "$i"
    done
elif [[ $# -gt 0 && "$SELECT" = 0 && "$XVALID" = 1 ]]; then
    search "$@"
else
    usage
fi