update namen.sh: meerdere zoekfuncties
This commit is contained in:
101
namen.sh
101
namen.sh
@@ -5,8 +5,19 @@ usage() {
|
|||||||
echo "
|
echo "
|
||||||
gebruik:
|
gebruik:
|
||||||
|
|
||||||
$PROGNAME [opties] -i
|
$PROGNAME [opties] -x 1|2|3|4|5|6|7|8 -i
|
||||||
$PROGNAME [opties] corpusfile(s) en/of corpusdirectory(s)
|
$PROGNAME [opties] -x 1|2|3|4|5|6|7|8 corpusfile(s) en/of corpusdirectory(s)
|
||||||
|
|
||||||
|
-x : query
|
||||||
|
|
||||||
|
1 : nieuwe namen
|
||||||
|
2 : nieuwe woorden
|
||||||
|
3 : nieuwe woorden met postag en lemma
|
||||||
|
4 : bestaande locaties
|
||||||
|
5 : bestaande personen
|
||||||
|
6 : bestaande organisaties
|
||||||
|
7 : bestaande andere namen
|
||||||
|
8 : nieuwe adjectieven, deelwoorden en werkwoorden
|
||||||
|
|
||||||
-i : kies interactief
|
-i : kies interactief
|
||||||
|
|
||||||
@@ -25,7 +36,9 @@ SELECT=0
|
|||||||
USEPAGER=0
|
USEPAGER=0
|
||||||
LIMIT=0
|
LIMIT=0
|
||||||
VERBOSE=0
|
VERBOSE=0
|
||||||
while getopts 'sin:pv' opt
|
XN=''
|
||||||
|
XVALID=0
|
||||||
|
while getopts 'sin:pvx:' opt
|
||||||
do
|
do
|
||||||
case "$opt" in
|
case "$opt" in
|
||||||
i)
|
i)
|
||||||
@@ -40,16 +53,73 @@ do
|
|||||||
s)
|
s)
|
||||||
SINGLE=1
|
SINGLE=1
|
||||||
;;
|
;;
|
||||||
v)
|
v)
|
||||||
VERBOSE=1
|
VERBOSE=1
|
||||||
;;
|
;;
|
||||||
*)
|
x)
|
||||||
usage
|
XN="$OPTARG"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
shift "$(($OPTIND -1))"
|
shift "$(($OPTIND -1))"
|
||||||
|
|
||||||
|
case $XN in
|
||||||
|
1)
|
||||||
|
# nieuwe namen
|
||||||
|
EXPR='fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]'
|
||||||
|
TEMPLATE='tt:%w'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
2)
|
||||||
|
# nieuwe woorden
|
||||||
|
EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
|
||||||
|
TEMPLATE='tt:%w'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
3)
|
||||||
|
# nieuwe woorden met postag en lemma
|
||||||
|
EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
|
||||||
|
TEMPLATE='tt:%w\t%l\t%P'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
4)
|
||||||
|
# bestaande locaties
|
||||||
|
EXPR='fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]'
|
||||||
|
TEMPLATE='tt:%l'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
5)
|
||||||
|
# bestaande personen
|
||||||
|
EXPR='fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]'
|
||||||
|
TEMPLATE='tt:%l'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
6)
|
||||||
|
# bestaande organisaties
|
||||||
|
EXPR='fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]'
|
||||||
|
TEMPLATE='tt:%l'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
7)
|
||||||
|
# bestaande andere namen (boeken, films, events, .. )
|
||||||
|
EXPR='fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]'
|
||||||
|
TEMPLATE='tt:%l'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
8)
|
||||||
|
# nieuwe adjectieven, deelwoorden en werkwoorden
|
||||||
|
EXPR='fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]'
|
||||||
|
TEMPLATE='tt:%w\t%P'
|
||||||
|
XVALID=1
|
||||||
|
;;
|
||||||
|
'')
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo Invalid value for option -x
|
||||||
|
exit
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
TAIL=''
|
TAIL=''
|
||||||
if [ $LIMIT -gt 0 ]
|
if [ $LIMIT -gt 0 ]
|
||||||
then
|
then
|
||||||
@@ -60,22 +130,21 @@ then
|
|||||||
TAIL="$TAIL | ${PAGER:-less}"
|
TAIL="$TAIL | ${PAGER:-less}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
EXPR='fp://node[@cat="mwu" and node[@pt="spec"] and not(@his="normal") and not(@his_1="decap")]'
|
|
||||||
if [ $SINGLE = 1 ]
|
if [ $SINGLE = 1 ]
|
||||||
then
|
then
|
||||||
SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\t.*//'"
|
SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\(.*\)\t.*/\1\t/'"
|
||||||
TEMPLATE='tt:%w\t%I'
|
TEMPLATE="$TEMPLATE"'\t%I'
|
||||||
else
|
else
|
||||||
SORT=sort
|
SORT=sort
|
||||||
TEMPLATE='tt:%w'
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
search () {
|
search () {
|
||||||
|
CMD="alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sed -e 's/\([0-9]\) */\1\t/' | sort -nr 2> /dev/null | tabulate -s '\t' -f plain 2> /dev/null$TAIL"
|
||||||
if [ $VERBOSE = 1 ]
|
if [ $VERBOSE = 1 ]
|
||||||
then
|
then
|
||||||
echo "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL"
|
echo $CMD
|
||||||
fi
|
fi
|
||||||
eval "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL"
|
eval $CMD
|
||||||
}
|
}
|
||||||
|
|
||||||
for i in "$@"
|
for i in "$@"
|
||||||
@@ -92,14 +161,14 @@ do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ $# == 0 -a $SELECT = 1 ]
|
if [ $# == 0 -a $SELECT = 1 -a $XVALID = 1 ]
|
||||||
then
|
then
|
||||||
cd /net/corpora/nlnieuws
|
cd /net/corpora/nlnieuws
|
||||||
select i in `find . -name '*data.dz' | sort`
|
select i in `find . -name '*data.dz' | sort`
|
||||||
do
|
do
|
||||||
search $i
|
search $i
|
||||||
done
|
done
|
||||||
elif [ $# -gt 0 -a $SELECT = 0 ]
|
elif [ $# -gt 0 -a $SELECT = 0 -a $XVALID = 1 ]
|
||||||
then
|
then
|
||||||
search "$@"
|
search "$@"
|
||||||
else
|
else
|
||||||
|
|||||||
Reference in New Issue
Block a user