grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy* -> yyyy/yyyy*
- etc
This commit is contained in:
@@ -28,7 +28,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -36,17 +36,17 @@ func main() {
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
t := time.Now().AddDate(0, 0, -2)
|
||||
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy-mm-dd"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := strings.ReplaceAll(ds, "-", "/")
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
|
||||
@@ -94,8 +94,7 @@ func main() {
|
||||
}
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NOS
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NOS/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +42,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../json2txt $ds
|
||||
json2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,15 +57,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# count per message (news item), not per sentence
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
Reference in New Issue
Block a user