collect.sh: stijgers
This commit is contained in:
67
collect.sh
67
collect.sh
@@ -3,8 +3,9 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=/net/corpora/nlnieuws/bin:/net/corpora/nlnieuws/python:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
|
. /net/corpora/nlnieuws/python/env/bin/activate
|
||||||
|
|
||||||
verbose=0
|
verbose=0
|
||||||
if [ "$1" = "-v" ]
|
if [ "$1" = "-v" ]
|
||||||
@@ -35,6 +36,8 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
ds1=$(weekadd $ds -1)
|
||||||
|
|
||||||
year=${ds%%.*}
|
year=${ds%%.*}
|
||||||
|
|
||||||
mkdir -p /net/corpora/nlnieuws/data/$year
|
mkdir -p /net/corpora/nlnieuws/data/$year
|
||||||
@@ -165,7 +168,7 @@ do
|
|||||||
|
|
||||||
# kale tellingen
|
# kale tellingen
|
||||||
|
|
||||||
say $part-count-word-$ds-$i
|
say $part-allewoorden-$ds-$i
|
||||||
alto \
|
alto \
|
||||||
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
|
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
|
||||||
'tt:%l\t%I' $files \
|
'tt:%l\t%I' $files \
|
||||||
@@ -173,49 +176,27 @@ do
|
|||||||
| sed -e 's/\t.*//' | uniq -c \
|
| sed -e 's/\t.*//' | uniq -c \
|
||||||
| grep -v '^ *1 ' \
|
| grep -v '^ *1 ' \
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||||
> $part-count-word-$ds-$i
|
> $part-allewoorden-$ds-$i
|
||||||
|
|
||||||
say $part-count-loc-$ds-$i
|
|
||||||
alto \
|
|
||||||
'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
|
|
||||||
'tt:%l\t%I' $files \
|
|
||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
|
||||||
| sed -e 's/\t.*//' | uniq -c \
|
|
||||||
| grep -v '^ *1 ' \
|
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
|
||||||
> $part-count-loc-$ds-$i
|
|
||||||
|
|
||||||
say $part-count-per-$ds-$i
|
|
||||||
alto \
|
|
||||||
'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
|
|
||||||
'tt:%l\t%I' $files \
|
|
||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
|
||||||
| sed -e 's/\t.*//' | uniq -c \
|
|
||||||
| grep -v '^ *1 ' \
|
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
|
||||||
> $part-count-per-$ds-$i
|
|
||||||
|
|
||||||
say $part-count-org-$ds-$i
|
|
||||||
alto \
|
|
||||||
'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
|
|
||||||
'tt:%l\t%I' $files \
|
|
||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
|
||||||
| sed -e 's/\t.*//' | uniq -c \
|
|
||||||
| grep -v '^ *1 ' \
|
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
|
||||||
> $part-count-org-$ds-$i
|
|
||||||
|
|
||||||
say $part-count-misc-$ds-$i
|
|
||||||
alto \
|
|
||||||
'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
|
|
||||||
'tt:%l\t%I' $files \
|
|
||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
|
||||||
| sed -e 's/\t.*//' | uniq -c \
|
|
||||||
| grep -v '^ *1 ' \
|
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
|
||||||
> $part-count-misc-$ds-$i
|
|
||||||
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# score
|
||||||
|
say "$part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14"
|
||||||
|
stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14
|
||||||
|
|
||||||
|
say "$part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14"
|
||||||
|
stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14
|
||||||
|
|
||||||
|
say "$part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14"
|
||||||
|
stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14
|
||||||
|
|
||||||
|
say "$part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14"
|
||||||
|
stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14
|
||||||
|
|
||||||
|
say "$part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14"
|
||||||
|
stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14
|
||||||
|
|
||||||
|
|
||||||
done
|
done
|
||||||
|
|
||||||
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
|
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#!/net/corpora/nlnieuws/notebook/bin/python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
Reference in New Issue
Block a user