Files
nlnieuws/namen.sh
Peter Kleiweg f9b0e83062 9,10 -> C,T
2026-04-03 13:15:39 +02:00

198 lines
4.8 KiB
Bash
Executable File

#!/bin/bash
# Put the nlnieuws bin directory at the front of the search path
# (presumably where the alto and flush tools used below live -- confirm).
PATH="/net/corpora/nlnieuws/bin:${PATH}"
# Name the script was invoked with; interpolated into the usage text.
PROGNAME="$0"
# Print the usage text and terminate the script.
#
# Globals:  PROGNAME (read) - script name shown in the synopsis lines.
# Outputs:  the usage text on stderr; every caller reaches this function
#           because the command line was unusable, so it is a diagnostic.
# Returns:  never returns; exits the shell with status 1.
usage() {
  cat >&2 <<EOF

gebruik:
$PROGNAME [opties] -x 1..10 -i
$PROGNAME [opties] -x 1..10 corpusfile(s) en/of corpusdirectory(s)
-x : query
1 : nieuwe namen
2 : nieuwe woorden
3 : nieuwe woorden met postag en lemma
4 : bestaande locaties
5 : bestaande personen
6 : bestaande organisaties
7 : bestaande andere namen
8 : nieuwe adjectieven, deelwoorden en werkwoorden
9, C : categorieën
10, T : tags
-i : kies interactief
overige opties:
-n int : max aantal resultaten
-p : gebruik pager
-s : tel hits één keer per bericht
-v : verbose

EOF
  exit 1
}
# Option state; the defaults below apply when an option is not given.
SINGLE=0    # -s : count a hit only once per message
SELECT=0    # -i : choose the corpus interactively
USEPAGER=0  # -p : send the result through a pager
LIMIT=0     # -n : maximum number of results (0 = unlimited)
VERBOSE=0   # -v : print the generated command before running it
XN=''       # -x : raw query selector as typed by the user
XVALID=0    # becomes 1 once -x has been recognised as a known query

# Parse the command-line options into the globals above.
#
# Arguments: the script's command-line arguments.
# Globals:   SINGLE, SELECT, USEPAGER, LIMIT, VERBOSE, XN (written);
#            OPTIND is left pointing at the first non-option argument.
# Exits:     status 1 when -n is not a non-negative integer; LIMIT is later
#            used in a numeric test and pasted into an eval'd command line,
#            so anything but digits is rejected here.
#            Unknown options and missing option arguments (getopts has
#            already printed a diagnostic for those) go to usage.
parse_options() {
  local opt
  while getopts 'sin:pvx:' opt; do
    case "$opt" in
      i) SELECT=1 ;;
      n)
        case "$OPTARG" in
          '' | *[!0-9]*)
            printf '%s: option -n needs a non-negative integer, got "%s"\n' \
              "$0" "$OPTARG" >&2
            exit 1
            ;;
        esac
        LIMIT="$OPTARG"
        ;;
      p) USEPAGER=1 ;;
      s) SINGLE=1 ;;
      v) VERBOSE=1 ;;
      x) XN="$OPTARG" ;;
      *) usage ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed options so "$@" holds only corpus files/directories.
shift "$((OPTIND - 1))"
# Translate the -x selector into the query expression and output template
# that search() hands to alto.
#
# Arguments: $1 - the selector given with -x (1..10, C or T), or empty when
#                 -x was not given.
# Globals:   EXPR, TEMPLATE (written); XVALID is set to 1 for a known
#            selector and left untouched for an empty one.
# Exits:     status 1, with a message on stderr, for an unknown selector.
select_query() {
  local neclass
  # Queries 2 and 3 select the same nodes and differ only in the template.
  local new_words='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'

  case "$1" in
    '')
      # No -x given: nothing to select, XVALID keeps its current value.
      return 0
      ;;
    1)
      # new names
      EXPR='fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]'
      TEMPLATE='tt:%w'
      ;;
    2)
      # new words
      EXPR="$new_words"
      TEMPLATE='tt:%w'
      ;;
    3)
      # new words with POS tag and lemma
      EXPR="$new_words"
      TEMPLATE='tt:%w\t%l\t%P'
      ;;
    4 | 5 | 6 | 7)
      # existing and new named entities of one class:
      # 4 locations, 5 persons, 6 organisations,
      # 7 other names (books, films, events, ..)
      case "$1" in
        4) neclass=LOC ;;
        5) neclass=PER ;;
        6) neclass=ORG ;;
        7) neclass=MISC ;;
      esac
      printf -v EXPR 'fp://node[(@neclass="%s" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="%s"])]' \
        "$neclass" "$neclass"
      TEMPLATE='tt:%l'
      ;;
    8)
      # new adjectives, participles and verbs
      EXPR='fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]'
      TEMPLATE='tt:%w\t%P'
      ;;
    9 | C)
      # categories
      EXPR='fp://meta[@name="cat"]/@value'
      TEMPLATE='tt:%m'
      ;;
    10 | T)
      # tags
      EXPR='fp://meta[@name="tag"]/@value'
      TEMPLATE='tt:%m'
      ;;
    *)
      printf 'Invalid value for option -x: %s\n' "$1" >&2
      exit 1
      ;;
  esac
  XVALID=1
}

select_query "$XN"
# Pipeline fragments that search() splices into the command it builds.
#
# HEAD is inserted before the column formatter, TAIL after it. Exactly one
# of them carries the "flush" stage: with a result limit it sits in HEAD in
# front of "head", otherwise it sits at the start of TAIL.
HEAD=''
TAIL=' | flush'
if [ $LIMIT -gt 0 ]
then
  HEAD=" | flush | head -n $LIMIT"
  TAIL=''
fi

# -p: finish the pipeline with the user's pager, less by default.
case "$USEPAGER" in
  1) TAIL="$TAIL | ${PAGER:-less}" ;;
esac

# SORT is the stage placed in front of "uniq -c".
SORT=sort
case "$SINGLE" in
  1)
    # -s: an extra %I column is appended to every output line. The first sed
    # removes a trailing ".<digits>" from it, sort | uniq drops duplicate
    # lines, and the second sed empties the last tab-separated column again.
    # NOTE(review): %I presumably is a sentence id of the form
    # <message>.<number> -- confirm against the alto documentation.
    SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\(.*\)\t.*/\1\t/'"
    TEMPLATE=${TEMPLATE}'\t%I'
    ;;
esac
# Run the selected query on the given corpora and print a frequency table,
# most frequent first.
#
# Arguments: corpus files and/or directories, passed on to alto.
# Globals:   EXPR, TEMPLATE, SORT, HEAD, TAIL, VERBOSE (read);
#            CMD (written) - the complete command line that is executed.
# Outputs:   the table on stdout; with VERBOSE=1 the command line first.
# Returns:   the status of the eval'd pipeline.
search () {
  local tab=$'\t'
  local corpora=''
  # The command is a string run through eval because SORT, HEAD and TAIL are
  # pipeline fragments. Each corpus argument is therefore shell-quoted with
  # %q, so a path with spaces or metacharacters stays one literal word.
  if [ "$#" -gt 0 ]
  then
    printf -v corpora '%q ' "$@"
  fi
  CMD="alto ${corpora}'$EXPR' '$TEMPLATE' | $SORT | uniq -c | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s$HEAD | column -t -s '$tab' -c 0$TAIL"
  if [ "$VERBOSE" = 1 ]
  then
    printf '%s\n' "$CMD"
  fi
  eval "$CMD"
}
# Every remaining argument must be a corpus file with one of the known
# extensions, or an existing directory; anything else ends in the usage text.
for arg in "$@"
do
  case "$arg" in
    *.xml | *.dact | *.dbxml | *.data.dz | *.index | *.zip)
      # Recognised corpus file name; it is not checked for existence.
      continue
      ;;
  esac
  [ -d "$arg" ] || usage
done
# Run the query: interactively (-i, no corpus arguments) or on the corpora
# named on the command line. Any other combination, or a missing -x, ends in
# the usage text.
if [ "$#" -eq 0 ] && [ "$SELECT" = 1 ] && [ "$XVALID" = 1 ]
then
  # Without this check a failed cd would make find scan whatever directory
  # the script happened to be started from.
  cd /net/corpora/nlnieuws || {
    printf '%s: cannot change to /net/corpora/nlnieuws\n' "$0" >&2
    exit 1
  }
  # Collect the corpora line by line so that names containing spaces or glob
  # characters stay intact.
  corpora=()
  while IFS= read -r corpus
  do
    corpora+=("$corpus")
  done < <(find . -name '*data.dz' | sort)
  # The menu is shown again after each search until end-of-input.
  select i in "${corpora[@]}"
  do
    # select leaves the variable empty when the reply is not a menu number.
    if [ -z "$i" ]
    then
      continue
    fi
    search "$i"
  done
elif [ "$#" -gt 0 ] && [ "$SELECT" = 0 ] && [ "$XVALID" = 1 ]
then
  search "$@"
else
  usage
fi