#!/bin/bash
#
# query.sh - run one of a fixed set of predefined queries over corpus
# files or directories and print the hits with their frequencies.
# Invoking the script without a valid combination of options prints the
# usage text.

# Search the corpus tool directory first. The script later calls the
# external commands `alto` and `flush`; presumably they are installed
# there (NOTE(review): confirm).
PATH=/net/corpora/nlnieuws/bin:$PATH

# Name the script was invoked as; interpolated into the usage text.
PROGNAME=$0

#######################################
# Print the (Dutch) usage text and terminate the script.
# Every caller invokes this on an invalid command line, so the text goes
# to stderr and the exit status is non-zero.
# Globals:   PROGNAME (read)
# Arguments: none
# Outputs:   usage text on stderr
# Returns:   does not return; exits with status 2
#######################################
usage() {
  # The blank first and last lines of the here-document reproduce the
  # leading and trailing empty lines of the original message.
  cat >&2 <<EOF

gebruik:

$PROGNAME [opties] -x 1..10 -i
$PROGNAME [opties] -x 1..10 corpusfile(s) en/of corpusdirectory(s)

-x : query

1 : nieuwe namen
2 : nieuwe woorden
3 : nieuwe woorden met postag en lemma
4 : bestaande locaties
5 : bestaande personen
6 : bestaande organisaties
7 : bestaande andere namen
8 : nieuwe adjectieven, deelwoorden en werkwoorden
9 : categorieën
10 : tags

-i : kies interactief

overige opties:

-n int : max aantal resultaten
-p : gebruik pager
-s : tel hits één keer per bericht
-v : verbose

EOF
  exit 2
}

# Option defaults; parse_options overwrites them when the matching flag
# is present on the command line.
SINGLE=0   # -s: count a hit only once per message
SELECT=0   # -i: pick the corpus interactively
USEPAGER=0 # -p: send the result through a pager
LIMIT=0    # -n: maximum number of results; 0 means no limit
VERBOSE=0  # -v: print the generated command before running it
XN=''      # -x: selected query
XVALID=0   # becomes 1 once XN has been recognised as a known query

#######################################
# Parse the command-line flags into the option globals.
# Globals:   SELECT, LIMIT, USEPAGER, SINGLE, VERBOSE, XN (written),
#            OPTIND (written; index of the first non-option argument)
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stderr for invalid input
# Returns:   0 on success; exits the script on an unknown option or a
#            non-numeric -n value
#######################################
parse_options() {
  local opt
  OPTIND=1
  while getopts 'sin:pvx:' opt; do
    case "$opt" in
      i) SELECT=1 ;;
      n)
        # LIMIT is later compared numerically and spliced into a command
        # line, so accept digits only.
        case "$OPTARG" in
          '' | *[!0-9]*)
            echo "Invalid value for option -n" >&2
            exit 2
            ;;
        esac
        LIMIT=$OPTARG
        ;;
      p) USEPAGER=1 ;;
      s) SINGLE=1 ;;
      v) VERBOSE=1 ;;
      x) XN=$OPTARG ;;
      # getopts has already reported the unknown option or the missing
      # argument; show the usage text instead of silently carrying on.
      *) usage ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed options; what remains are corpus files/directories.
shift "$((OPTIND - 1))"

#######################################
# Map the -x selector onto the search expression and output template
# that are later handed to `alto`.
# Globals:   EXPR, TEMPLATE, XVALID (written)
# Arguments: $1 - value given to -x (1..10, C, T, or empty)
# Outputs:   an error message on stderr for an unknown selector
# Returns:   0 for a known or empty selector; exits with status 2 for an
#            unknown one
# NOTE(review): the usage text labels queries 4-7 "existing ...", while
# the original comments on these branches said "existing and new ...".
#######################################
select_query() {
  # Queries 2 and 3 share one expression and differ only in the template.
  local new_words='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
  # Queries 4-7 share one expression and differ only in the @neclass value.
  local ne_class=''

  case "$1" in
    1)
      # new names
      EXPR='fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]'
      TEMPLATE='tt:%w'
      ;;
    2)
      # new words
      EXPR=$new_words
      TEMPLATE='tt:%w'
      ;;
    3)
      # new words with POS tag and lemma
      EXPR=$new_words
      TEMPLATE='tt:%w\t%l\t%P'
      ;;
    4) ne_class=LOC ;;  # existing and new locations
    5) ne_class=PER ;;  # existing and new persons
    6) ne_class=ORG ;;  # existing and new organisations
    7) ne_class=MISC ;; # existing and new other names (books, films, events, ..)
    8)
      # new adjectives, participles and verbs
      EXPR='fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]'
      TEMPLATE='tt:%w\t%P'
      ;;
    9 | C)
      # categories
      EXPR='fp://meta[@name="cat"]/@value'
      TEMPLATE='tt:%m'
      ;;
    10 | T)
      # tags
      EXPR='fp://meta[@name="tag"]/@value'
      TEMPLATE='tt:%m'
      ;;
    '')
      # No -x given: leave XVALID untouched so the caller shows the usage.
      return 0
      ;;
    *)
      echo "Invalid value for option -x" >&2
      exit 2
      ;;
  esac

  if [[ -n $ne_class ]]; then
    printf -v EXPR 'fp://node[(@neclass="%s" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="%s"])]' "$ne_class" "$ne_class"
    TEMPLATE='tt:%l'
  fi
  XVALID=1
}

select_query "$XN"

#######################################
# Build the optional fragments of the shell pipeline that search()
# assembles and evaluates.
# Globals:   LIMIT, USEPAGER, SINGLE, PAGER (read)
#            HEAD, TAIL, SORT (written), TEMPLATE (appended to with -s)
# Arguments: none
# Outputs:   an error message on stderr when LIMIT is not a number
# Returns:   0 on success; exits with status 2 on a non-numeric LIMIT
#######################################
configure_pipeline() {
  local limit=${LIMIT:-0}

  HEAD=''
  TAIL=''

  # The limit is spliced into a command line, so accept digits only.
  case "$limit" in
    '' | *[!0-9]*)
      echo "Invalid value for option -n" >&2
      exit 2
      ;;
  esac

  # `flush` is an external helper whose behaviour is not visible here.
  # With a limit it runs right before `head`, otherwise at the end.
  if [ "$limit" -gt 0 ]; then
    HEAD=" | flush | head -n $limit"
  else
    TAIL=" | flush"
  fi

  if [ "${USEPAGER:-0}" = 1 ]; then
    TAIL="$TAIL | ${PAGER:-less}"
  fi

  if [ "${SINGLE:-0}" = 1 ]; then
    # Count each hit once per message: the template gets an extra
    # tab-separated %I field (presumably an identifier - TODO confirm);
    # a trailing ".<digits>" is stripped from every line, duplicates are
    # removed, and finally the last tab-separated field is dropped again.
    SORT="sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | sed -e 's/\(.*\)\t.*/\1/'"
    TEMPLATE="$TEMPLATE"'\t%I'
  else
    SORT=sort
  fi
}

configure_pipeline

#######################################
# Run the selected query on the given corpora and print a frequency
# table of the hits, most frequent first.
# Globals:   EXPR, TEMPLATE, SORT, HEAD, TAIL, VERBOSE (read)
# Arguments: one or more corpus files and/or directories
# Outputs:   the frequency table on stdout; with VERBOSE=1 the generated
#            command line is printed first
# Returns:   1 when called without arguments, otherwise the status of
#            the evaluated pipeline
#######################################
search() {
  local quoted_args cmd

  if [ "$#" -eq 0 ]; then
    echo "search: no corpus given" >&2
    return 1
  fi

  # The pipeline is assembled as a string and evaluated, so every corpus
  # argument is shell-quoted first; paths with spaces or metacharacters
  # then reach alto as a single, literal argument.
  printf -v quoted_args '%q ' "$@"

  # EXPR and TEMPLATE are script-internal constants without single
  # quotes, so plain single-quoting is sufficient for them.
  # Stages: alto output -> $SORT -> count duplicates -> turn the blanks
  # after the count into a tab -> order case-insensitively by value ->
  # stable numeric sort by descending count -> optional limit -> align
  # the columns. $'\t' in bash is a tab.
  # The sed pattern requires at least one blank after the digit so that
  # multi-digit counts stay in one piece.
  cmd="alto ${quoted_args}'$EXPR' '$TEMPLATE' | $SORT | uniq -c | sed -e 's/\([0-9]\)[ ][ ]*/\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s$HEAD | column -t -s '"$'\t'"' -c 0$TAIL"

  if [ "$VERBOSE" = 1 ]; then
    echo "$cmd"
  fi
  eval "$cmd"
}

#######################################
# Validate the positional arguments and run the query, either on the
# corpora named on the command line or on one chosen interactively.
# Globals:   SELECT, XVALID (read)
# Arguments: corpus files and/or directories (none in interactive mode)
# Outputs:   whatever search() prints; the selection menu in -i mode
# Returns:   the status of the last search; usage() terminates the
#            script on an invalid combination of options and arguments
#######################################
run_query() {
  local arg corpus
  local -a corpora=()

  # Every argument must look like a corpus file or be a directory.
  for arg in "$@"; do
    case "$arg" in
      *.xml | *.dact | *.dbxml | *.data.dz | *.index | *.zip) ;;
      *)
        if [[ ! -d $arg ]]; then
          usage
        fi
        ;;
    esac
  done

  if [[ $# -eq 0 && $SELECT == 1 && $XVALID == 1 ]]; then
    cd /net/corpora/nlnieuws || {
      echo "Cannot change to /net/corpora/nlnieuws" >&2
      exit 1
    }
    # Collect the menu entries line by line so that paths containing
    # blanks stay intact.
    while IFS= read -r corpus; do
      corpora+=("$corpus")
    done < <(find . -name '*data.dz' | sort)

    select corpus in "${corpora[@]}"; do
      # `select` leaves the variable empty on an invalid choice.
      if [[ -z $corpus ]]; then
        echo "Invalid selection" >&2
        continue
      fi
      search "$corpus"
    done
  elif [[ $# -gt 0 && $SELECT == 0 && $XVALID == 1 ]]; then
    search "$@"
  else
    usage
  fi
}

run_query "$@"