From 2390752ba4be69aa788a43aa95f0b8a3f0030548 Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Tue, 24 Mar 2026 14:04:36 +0100 Subject: [PATCH] update namen.sh: meerdere zoekfuncties --- namen.sh | 101 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 16 deletions(-) diff --git a/namen.sh b/namen.sh index 4e62b80..12e9489 100755 --- a/namen.sh +++ b/namen.sh @@ -5,8 +5,19 @@ usage() { echo " gebruik: - $PROGNAME [opties] -i - $PROGNAME [opties] corpusfile(s) en/of corpusdirectory(s) + $PROGNAME [opties] -x 1|2|3|4|5 -i + $PROGNAME [opties] -x 1|2|3|4|5 corpusfile(s) en/of corpusdirectory(s) + + -x : query + + 1 : nieuwe namen + 2 : nieuwe woorden + 3 : nieuwe woorden met postag en lemma + 4 : bestaaande locaties + 5 : bestaande personen + 6 : bestaande organisaties + 7 : bestaande andere namen + 8 : nieuwe adjectieven, deelwoorden en werkwoorden -i : kies interactief @@ -25,7 +36,9 @@ SELECT=0 USEPAGER=0 LIMIT=0 VERBOSE=0 -while getopts 'sin:pv' opt +XN='' +XVALID=0 +while getopts 'sin:pvx:' opt do case "$opt" in i) @@ -40,16 +53,73 @@ do s) SINGLE=1 ;; - v) - VERBOSE=1 - ;; - *) - usage + v) + VERBOSE=1 + ;; + x) + XN="$OPTARG" ;; esac done shift "$(($OPTIND -1))" +case $XN in + 1) + # nieuwe namen + EXPR='fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' + TEMPLATE='tt:%w' + XVALID=1 + ;; + 2) + # nieuwe woorden + EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' + TEMPLATE='tt:%w' + XVALID=1 + ;; + 3) + # nieuwe woorden met postag en lemma + EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' + TEMPLATE='tt:%w\t%l\t%P' + XVALID=1 + ;; + 4) + # bestaande locaties + EXPR='fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' + TEMPLATE='tt:%l' + XVALID=1 + ;; + 5) + # bestaande personen + EXPR='fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' + TEMPLATE='tt:%l' + XVALID=1 + ;; + 6) + # bestaande organisaties + EXPR='fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' + TEMPLATE='tt:%l' + XVALID=1 + ;; + 7) + # bestaande andere namen (boeken, films, events, .. ) + EXPR='fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' + TEMPLATE='tt:%l' + XVALID=1 + ;; + 8) + # nieuwe adjectieven, deelwoorden en werkwoorden + EXPR='fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' + TEMPLATE='tt:%w\t%P' + XVALID=1 + ;; + '') + ;; + *) + echo Invalid value for option -x + exit + ;; +esac + TAIL='' if [ $LIMIT -gt 0 ] then @@ -60,22 +130,21 @@ then TAIL="$TAIL | ${PAGER:-less}" fi -EXPR='fp://node[@cat="mwu" and node[@pt="spec"] and not(@his="normal") and not(@his_1="decap")]' if [ $SINGLE = 1 ] then - SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\t.*//'" - TEMPLATE='tt:%w\t%I' + SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\(.*\)\t.*/\1\t/'" + TEMPLATE="$TEMPLATE"'\t%I' else SORT=sort - TEMPLATE='tt:%w' fi search () { + CMD="alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sed -e 's/\([0-9]\) */\1\t/' | sort -nr 2> /dev/null | tabulate -s '\t' -f plain 2> /dev/null$TAIL" if [ $VERBOSE = 1 ] then - echo "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL" + echo $CMD fi - eval "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL" + eval $CMD } for i in "$@" @@ -92,14 +161,14 @@ do esac done -if [ $# == 0 -a $SELECT = 1 ] +if [ $# == 0 -a $SELECT = 1 -a $XVALID = 1 ] then cd /net/corpora/nlnieuws select i in `find . -name '*data.dz' | sort` do search $i done -elif [ $# -gt 0 -a $SELECT = 0 ] +elif [ $# -gt 0 -a $SELECT = 0 -a $XVALID = 1 ] then search "$@" else