grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy* -> yyyy/yyyy*
- etc.
This commit is contained in:
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ type Item struct {
|
|||||||
var (
|
var (
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -31,16 +31,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/AT5
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/AT5/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,7 +58,7 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/BuurtAdam
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/BuurtAdam/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,14 +56,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/BuurtGrn
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/BuurtGrn/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,14 +56,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/GG
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/GG/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/GG/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,7 +56,7 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
|
|||||||
@@ -101,8 +101,7 @@ func main() {
|
|||||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
|
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/HLN
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -2days +%Y-%m-%d`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -23,10 +26,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
dp=${ds//-//}
|
||||||
|
year=${ds%%-*}
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/HLN/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,14 +55,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ var (
|
|||||||
w = e.WarnErr
|
w = e.WarnErr
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -33,16 +33,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/LitNL
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/LitNL/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,14 +58,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ type Item struct {
|
|||||||
var (
|
var (
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -36,17 +36,17 @@ func main() {
|
|||||||
var ds string
|
var ds string
|
||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
t := time.Now().AddDate(0, 0, -2)
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy-mm-dd"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := strings.ReplaceAll(ds, "-", "/")
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -94,8 +94,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
|
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/NOS
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -2days +%Y-%m-%d`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -23,10 +26,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
dp=${ds//-//}
|
||||||
|
year=${ds%%-*}
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/NOS/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +42,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../json2txt $ds
|
json2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,15 +57,15 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -98,8 +98,7 @@ func main() {
|
|||||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
|
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/NU
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -2days +%Y-%m-%d`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -23,10 +26,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
dp=${ds//-//}
|
||||||
|
year=${ds%%-*}
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/NU/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/NU/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,14 +55,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -2,8 +2,11 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/NieuwsNL
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
@@ -25,10 +28,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
dp=${ds//-//}
|
||||||
|
year=${ds%%-*}
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/NieuwsNL/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -53,14 +57,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ type Item struct {
|
|||||||
var (
|
var (
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -32,16 +32,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/Oog
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/Oog/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,14 +58,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/Parool
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/Parool/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,14 +56,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ var (
|
|||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
w = e.WarnErr
|
w = e.WarnErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -38,16 +38,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/RO
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/RO/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/RO/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,14 +58,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/RTVNoord
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/RTVNoord/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,15 +56,15 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ type Item struct {
|
|||||||
var (
|
var (
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -32,16 +32,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/Sargasso
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/Sargasso/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,14 +58,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/Sikkom
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/Sikkom/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,7 +56,7 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
p(err)
|
p(err)
|
||||||
year, week := t.ISOWeek()
|
year, week := t.ISOWeek()
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ type Item struct {
|
|||||||
var (
|
var (
|
||||||
x = e.ExitErr
|
x = e.ExitErr
|
||||||
|
|
||||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -32,16 +32,16 @@ func main() {
|
|||||||
switch len(os.Args) {
|
switch len(os.Args) {
|
||||||
case 1:
|
case 1:
|
||||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||||
case 2:
|
case 2:
|
||||||
if !reYearWeek.MatchString(os.Args[1]) {
|
if !reYearWeek.MatchString(os.Args[1]) {
|
||||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||||
}
|
}
|
||||||
ds = os.Args[1]
|
ds = os.Args[1]
|
||||||
default:
|
default:
|
||||||
x(fmt.Errorf("too many arguments"))
|
x(fmt.Errorf("too many arguments"))
|
||||||
}
|
}
|
||||||
dp := ds[:4] + "/" + ds[5:]
|
dp := ds[:4] + "/w" + ds[5:]
|
||||||
|
|
||||||
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
|
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
|
||||||
x(os.MkdirAll("out", 0777))
|
x(os.MkdirAll("out", 0777))
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/Tzum
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -22,11 +25,13 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
year=${ds%.*}
|
||||||
|
week=${ds#*.}
|
||||||
|
dp=$year/w$week
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/Tzum/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -38,7 +43,7 @@ fi
|
|||||||
rm -fr out
|
rm -fr out
|
||||||
mkdir out
|
mkdir out
|
||||||
|
|
||||||
../../xml2txt $ds
|
xml2txt $ds
|
||||||
|
|
||||||
rm -f $corpus.lines
|
rm -f $corpus.lines
|
||||||
for i in out/*.txt
|
for i in out/*.txt
|
||||||
@@ -53,14 +58,14 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -110,8 +110,7 @@ func main() {
|
|||||||
if t2.After(t) {
|
if t2.After(t) {
|
||||||
t = t2
|
t = t2
|
||||||
}
|
}
|
||||||
year, week := t.ISOWeek()
|
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
|
|
||||||
if exists(dirname + "/lock") {
|
if exists(dirname + "/lock") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
BASE=/net/corpora/nlnieuws
|
||||||
|
PART=$BASE/VRT
|
||||||
|
|
||||||
unset CDPATH
|
unset CDPATH
|
||||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||||
export TZ=Europe/Amsterdam
|
export TZ=Europe/Amsterdam
|
||||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -2days +%Y-%m-%d`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -23,10 +26,11 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
dp=${ds//-//}
|
dp=${ds//-//}
|
||||||
|
year=${ds%%-*}
|
||||||
|
corpus=$PART/corpus/$year/$ds
|
||||||
|
mkdir -p $PART/corpus/$year
|
||||||
|
|
||||||
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
|
cd $PART/$dp
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/VRT/$dp
|
|
||||||
|
|
||||||
ln -s lock.$$ lock
|
ln -s lock.$$ lock
|
||||||
if [ "`readlink lock`" != lock.$$ ]
|
if [ "`readlink lock`" != lock.$$ ]
|
||||||
@@ -51,15 +55,15 @@ cd out
|
|||||||
mkdir xml
|
mkdir xml
|
||||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||||
|
|
||||||
../../../metadata
|
metadata
|
||||||
|
|
||||||
cd xml
|
cd xml
|
||||||
rm -f $corpus.data.dz $corpus.index
|
rm -f $corpus.data.dz $corpus.index
|
||||||
alto -q -o $corpus.data.dz *.xml
|
alto -q -o $corpus.data.dz *.xml
|
||||||
|
|
||||||
# telling per bericht, niet per zin
|
# telling per bericht, niet per zin
|
||||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||||
|
|
||||||
cd ../..
|
cd ../..
|
||||||
rm -fr out
|
rm -fr out
|
||||||
|
|||||||
@@ -70,9 +70,9 @@ var (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
aa := strings.Split(os.Args[1], "-")
|
aa := strings.Split(os.Args[1], ".")
|
||||||
if len(aa) != 2 {
|
if len(aa) != 2 {
|
||||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
@@ -128,7 +128,8 @@ func makeParts(source string) *Parts {
|
|||||||
func makeValues(source, part string) [][5]any {
|
func makeValues(source, part string) [][5]any {
|
||||||
v := make([][5]any, 0)
|
v := make([][5]any, 0)
|
||||||
|
|
||||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
|
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d-%02d-%d%s",
|
||||||
|
year,
|
||||||
sources[source],
|
sources[source],
|
||||||
parts[part].file,
|
parts[part].file,
|
||||||
year,
|
year,
|
||||||
@@ -226,7 +227,7 @@ func dates() (start, first, last string, names []string) {
|
|||||||
t3 := tStart
|
t3 := tStart
|
||||||
for range size {
|
for range size {
|
||||||
y, w := t3.ISOWeek()
|
y, w := t3.ISOWeek()
|
||||||
names = append(names, fmt.Sprintf("%d/%02d", y, w))
|
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
|
||||||
t3 = t3.AddDate(0, 0, 7)
|
t3 = t3.AddDate(0, 0, 7)
|
||||||
}
|
}
|
||||||
t3 = tStart
|
t3 = tStart
|
||||||
|
|||||||
@@ -26,13 +26,24 @@ var (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||||
x(err)
|
x(err)
|
||||||
|
for _, dir := range dirs {
|
||||||
|
if !dir.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
dirname := dir.Name()
|
||||||
|
if dirname[0] != '2' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
|
||||||
|
x(err)
|
||||||
|
|
||||||
for _, file := range files {
|
for _, file := range files {
|
||||||
filename := file.Name()
|
filename := file.Name()
|
||||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||||
addWeek(filename[5:12])
|
addWeek(filename[5:12])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -15,9 +15,9 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
aa := strings.Split(os.Args[1], "-")
|
aa := strings.Split(os.Args[1], ".")
|
||||||
if len(aa) != 2 {
|
if len(aa) != 2 {
|
||||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||||
}
|
}
|
||||||
|
|
||||||
year, err := strconv.Atoi(aa[0])
|
year, err := strconv.Atoi(aa[0])
|
||||||
@@ -58,7 +58,7 @@ func main() {
|
|||||||
fmt.Print(" -or")
|
fmt.Print(" -or")
|
||||||
}
|
}
|
||||||
y, w := t2.ISOWeek()
|
y, w := t2.ISOWeek()
|
||||||
fmt.Printf(" -name %d-%02d.data.dz", y, w)
|
fmt.Printf(" -name %d.%02d.data.dz", y, w)
|
||||||
t2 = t2.AddDate(0, 0, 7)
|
t2 = t2.AddDate(0, 0, 7)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
20
collect.sh
20
collect.sh
@@ -22,10 +22,10 @@ say () {
|
|||||||
|
|
||||||
if [ "$1" = "" ]
|
if [ "$1" = "" ]
|
||||||
then
|
then
|
||||||
ds=`date -d -7days +%G-%V`
|
ds=`date -d -7days +%G.%V`
|
||||||
else
|
else
|
||||||
case "$1" in
|
case "$1" in
|
||||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
2[0-9][0-9][0-9].[0-5][0-9])
|
||||||
ds=$1
|
ds=$1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -35,7 +35,11 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/data
|
year=${ds%%.*}
|
||||||
|
|
||||||
|
mkdir -p /net/corpora/nlnieuws/data/$year
|
||||||
|
mkdir -p /net/corpora/nlnieuws/data/json/$year
|
||||||
|
cd /net/corpora/nlnieuws/data/$year
|
||||||
|
|
||||||
declare -A parts
|
declare -A parts
|
||||||
#parts[alles]='.'
|
#parts[alles]='.'
|
||||||
@@ -68,7 +72,7 @@ do
|
|||||||
|
|
||||||
for i in 1 4
|
for i in 1 4
|
||||||
do
|
do
|
||||||
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
|
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
|
||||||
if [ -z "$files" ]
|
if [ -z "$files" ]
|
||||||
then
|
then
|
||||||
continue
|
continue
|
||||||
@@ -161,10 +165,10 @@ do
|
|||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
data2json $ds 1 > json/DATA-$ds-1.json
|
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
|
||||||
data2json $ds 4 > json/DATA-$ds-4.json
|
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
|
||||||
dates2json > json/index1.json
|
dates2json > ../json/index1.json
|
||||||
dates2json > json/index4.json
|
dates2json > ../json/index4.json
|
||||||
|
|
||||||
# rechten bijwerken
|
# rechten bijwerken
|
||||||
chmod -R g+w /net/corpora/nlnieuws
|
chmod -R g+w /net/corpora/nlnieuws
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ var (
|
|||||||
|
|
||||||
func AddEnd(s string) string {
|
func AddEnd(s string) string {
|
||||||
s = strings.TrimSpace(s)
|
s = strings.TrimSpace(s)
|
||||||
if s == "" || reEOL.MatchString(s) {
|
if s == "" {
|
||||||
return s
|
return ""
|
||||||
|
}
|
||||||
|
if reEOL.MatchString(s) {
|
||||||
|
return s + "\n"
|
||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|||||||
12
www/app.js
12
www/app.js
@@ -133,7 +133,9 @@ function makeTD(title, values) {
|
|||||||
|
|
||||||
async function loadSource(source, week) {
|
async function loadSource(source, week) {
|
||||||
if (!data.has(week)) {
|
if (!data.has(week)) {
|
||||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
data[week] = await getJSON(
|
||||||
|
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
|
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
|
||||||
@@ -154,7 +156,9 @@ async function loadSource(source, week) {
|
|||||||
|
|
||||||
async function loadPart(part, week) {
|
async function loadPart(part, week) {
|
||||||
if (!data.has(week)) {
|
if (!data.has(week)) {
|
||||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
data[week] = await getJSON(
|
||||||
|
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
|
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
|
||||||
@@ -185,7 +189,9 @@ async function loadWeken(source, part) {
|
|||||||
if (i < dates.length) {
|
if (i < dates.length) {
|
||||||
var week = dates[i].week
|
var week = dates[i].week
|
||||||
if (!data.has(week)) {
|
if (!data.has(week)) {
|
||||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
data[week] = await getJSON(
|
||||||
|
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||||
|
)
|
||||||
}
|
}
|
||||||
var values = data[week][source][part]
|
var values = data[week][source][part]
|
||||||
tr.appendChild(makeTD('t/m ' + data[week].last, values))
|
tr.appendChild(makeTD('t/m ' + data[week].last, values))
|
||||||
|
|||||||
Reference in New Issue
Block a user