grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag - yyyy-ww -> yyyy.ww - yyyy* -> yyyy/yyyy* etc
This commit is contained in:
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -31,16 +31,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/AT5
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/AT5/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,7 +58,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
@@ -95,7 +95,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/BuurtAdam
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/BuurtAdam/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -94,7 +94,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/BuurtGrn
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/BuurtGrn/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -95,7 +95,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/GG
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/GG/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/GG/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,7 +56,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
@@ -101,8 +101,7 @@ func main() {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/HLN
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/HLN/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +55,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ var (
|
||||
w = e.WarnErr
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -33,16 +33,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/LitNL
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/LitNL/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -28,7 +28,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -36,17 +36,17 @@ func main() {
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
t := time.Now().AddDate(0, 0, -2)
|
||||
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy-mm-dd"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := strings.ReplaceAll(ds, "-", "/")
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -94,8 +94,7 @@ func main() {
|
||||
}
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NOS
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NOS/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +42,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../json2txt $ds
|
||||
json2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,15 +57,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -98,8 +98,7 @@ func main() {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NU
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NU/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NU/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +55,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NieuwsNL
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
@@ -25,10 +28,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NieuwsNL/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -53,14 +57,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -32,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Oog
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Oog/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -102,7 +102,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Parool
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Parool/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ var (
|
||||
x = e.ExitErr
|
||||
w = e.WarnErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -38,16 +38,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/RO
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/RO/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/RO/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -101,7 +101,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/RTVNoord
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/RTVNoord/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,15 +56,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -32,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Sargasso
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Sargasso/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -90,7 +90,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Sikkom
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Sikkom/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,7 +56,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
@@ -77,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -32,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Tzum
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Tzum/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -110,8 +110,7 @@ func main() {
|
||||
if t2.After(t) {
|
||||
t = t2
|
||||
}
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/VRT
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/VRT/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,15 +55,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -70,9 +70,9 @@ var (
|
||||
|
||||
func main() {
|
||||
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
aa := strings.Split(os.Args[1], ".")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||
}
|
||||
|
||||
var err error
|
||||
@@ -128,7 +128,8 @@ func makeParts(source string) *Parts {
|
||||
func makeValues(source, part string) [][5]any {
|
||||
v := make([][5]any, 0)
|
||||
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d-%02d-%d%s",
|
||||
year,
|
||||
sources[source],
|
||||
parts[part].file,
|
||||
year,
|
||||
@@ -226,7 +227,7 @@ func dates() (start, first, last string, names []string) {
|
||||
t3 := tStart
|
||||
for range size {
|
||||
y, w := t3.ISOWeek()
|
||||
names = append(names, fmt.Sprintf("%d/%02d", y, w))
|
||||
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
|
||||
t3 = t3.AddDate(0, 0, 7)
|
||||
}
|
||||
t3 = tStart
|
||||
|
||||
@@ -26,13 +26,24 @@ var (
|
||||
|
||||
func main() {
|
||||
|
||||
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||
x(err)
|
||||
for _, dir := range dirs {
|
||||
if !dir.IsDir() {
|
||||
continue
|
||||
}
|
||||
dirname := dir.Name()
|
||||
if dirname[0] != '2' {
|
||||
continue
|
||||
}
|
||||
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
|
||||
x(err)
|
||||
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||
addWeek(filename[5:12])
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||
addWeek(filename[5:12])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
aa := strings.Split(os.Args[1], ".")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||
}
|
||||
|
||||
year, err := strconv.Atoi(aa[0])
|
||||
@@ -58,7 +58,7 @@ func main() {
|
||||
fmt.Print(" -or")
|
||||
}
|
||||
y, w := t2.ISOWeek()
|
||||
fmt.Printf(" -name %d-%02d.data.dz", y, w)
|
||||
fmt.Printf(" -name %d.%02d.data.dz", y, w)
|
||||
t2 = t2.AddDate(0, 0, 7)
|
||||
}
|
||||
|
||||
|
||||
20
collect.sh
20
collect.sh
@@ -22,10 +22,10 @@ say () {
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -35,7 +35,11 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
cd /net/corpora/nlnieuws/data
|
||||
year=${ds%%.*}
|
||||
|
||||
mkdir -p /net/corpora/nlnieuws/data/$year
|
||||
mkdir -p /net/corpora/nlnieuws/data/json/$year
|
||||
cd /net/corpora/nlnieuws/data/$year
|
||||
|
||||
declare -A parts
|
||||
#parts[alles]='.'
|
||||
@@ -68,7 +72,7 @@ do
|
||||
|
||||
for i in 1 4
|
||||
do
|
||||
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
|
||||
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
|
||||
if [ -z "$files" ]
|
||||
then
|
||||
continue
|
||||
@@ -161,10 +165,10 @@ do
|
||||
done
|
||||
done
|
||||
|
||||
data2json $ds 1 > json/DATA-$ds-1.json
|
||||
data2json $ds 4 > json/DATA-$ds-4.json
|
||||
dates2json > json/index1.json
|
||||
dates2json > json/index4.json
|
||||
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
|
||||
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
|
||||
dates2json > ../json/index1.json
|
||||
dates2json > ../json/index4.json
|
||||
|
||||
# rechten bijwerken
|
||||
chmod -R g+w /net/corpora/nlnieuws
|
||||
|
||||
@@ -17,8 +17,11 @@ var (
|
||||
|
||||
func AddEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" || reEOL.MatchString(s) {
|
||||
return s
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if reEOL.MatchString(s) {
|
||||
return s + "\n"
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
12
www/app.js
12
www/app.js
@@ -133,7 +133,9 @@ function makeTD(title, values) {
|
||||
|
||||
async function loadSource(source, week) {
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
|
||||
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
|
||||
@@ -154,7 +156,9 @@ async function loadSource(source, week) {
|
||||
|
||||
async function loadPart(part, week) {
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
|
||||
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
|
||||
@@ -185,7 +189,9 @@ async function loadWeken(source, part) {
|
||||
if (i < dates.length) {
|
||||
var week = dates[i].week
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
var values = data[week][source][part]
|
||||
tr.appendChild(makeTD('t/m ' + data[week].last, values))
|
||||
|
||||
Reference in New Issue
Block a user