Compare commits
29 Commits
7ce910203d
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8bea0ab44 | ||
|
|
d7adc17d4b | ||
|
|
a9f9e17acf | ||
|
|
1f4a084624 | ||
|
|
9f29222909 | ||
|
|
a76fa21584 | ||
|
|
efa301cc4a | ||
|
|
14590570ba | ||
|
|
ca4e7af8fa | ||
|
|
66581d4e98 | ||
|
|
e53049e62f | ||
|
|
7f23212fc3 | ||
|
|
5c651387af | ||
|
|
e430ff576b | ||
|
|
9d82f11536 | ||
|
|
650a13eb4a | ||
|
|
bf0407b933 | ||
|
|
fcad105a75 | ||
|
|
75832c3132 | ||
|
|
4b56c0cd70 | ||
|
|
81cc653ee7 | ||
|
|
c0335f5b57 | ||
|
|
0a43773ec8 | ||
|
|
3a12056d5f | ||
|
|
6d1dee6f3a | ||
|
|
4f9284c6d6 | ||
|
|
dbc8f9bfb8 | ||
|
|
b68a77c67a | ||
|
|
24dc3946f4 |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -8,6 +8,8 @@ BuurtGrn/buurtgrn
|
||||
BuurtGrn/metadata
|
||||
GG/gg
|
||||
GG/metadata
|
||||
HLN/metadata
|
||||
HLN/hln
|
||||
LitNL/litnl
|
||||
LitNL/metadata
|
||||
LitNL/xml2txt
|
||||
@@ -36,14 +38,17 @@ Sikkom/sikkom
|
||||
Tzum/metadata
|
||||
Tzum/tzum
|
||||
Tzum/xml2txt
|
||||
Volkskrant/metadata
|
||||
Volkskrant/volkskrant
|
||||
VRT/metadata
|
||||
VRT/vrt
|
||||
bin/data2json
|
||||
bin/dates2json
|
||||
bin/flush
|
||||
bin/items2count
|
||||
bin/score
|
||||
bin/rang
|
||||
bin/top20
|
||||
bin/trends
|
||||
bin/week2files
|
||||
20??
|
||||
corpus
|
||||
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
at5
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
at5: cmd/at5/*.go
|
||||
go build -o $@ $^
|
||||
at5: cmd/at5/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/AT5/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -110,16 +111,3 @@ func main() {
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -20,7 +22,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -29,16 +31,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -55,39 +57,15 @@ func main() {
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item), filename)
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body/p | //body/h2`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/AT5
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/AT5/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,7 +58,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
buurtadam: cmd/buurtadam/*.go
|
||||
go build -o $@ $^
|
||||
buurtadam: cmd/buurtadam/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,13 +4,14 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,7 +65,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/BuurtAdam/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -94,7 +95,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -158,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
@@ -202,7 +205,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
}
|
||||
|
||||
for _, div := range divs {
|
||||
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
@@ -211,40 +214,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/BuurtAdam
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/BuurtAdam/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
buurtgrn: cmd/buurtgrn/*.go
|
||||
go build -o $@ $^
|
||||
buurtgrn: cmd/buurtgrn/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -6,11 +6,11 @@ import (
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,7 +64,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/BuurtGrn/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -94,7 +94,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
@@ -202,7 +204,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
}
|
||||
|
||||
for _, div := range divs {
|
||||
p(fp.WriteString(addEnd(fixSpace(div.Content()))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
@@ -211,40 +213,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/BuurtGrn
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/BuurtGrn/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
gg: cmd/gg/*.go
|
||||
go build -o $@ $^
|
||||
gg: cmd/gg/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,13 +4,14 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,7 +65,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/GG/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -94,7 +95,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -154,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
@@ -191,10 +194,10 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
p(err)
|
||||
|
||||
p(fp.WriteString(addEnd(fixSpace(title))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
for _, el := range ell {
|
||||
p(fp.WriteString(addEnd(fixSpace(el.Content()))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(el.Content()))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
@@ -203,40 +206,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/GG
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/GG/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/GG/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,7 +56,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
9
HLN/Makefile
Normal file
9
HLN/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Build the two HLN command-line tools into this directory.
#
# The binaries are built by package path rather than by listing source
# files: "$<" expands to the first prerequisite only, so a second file
# added to cmd/hln would silently be left out of the build. The
# prerequisites are kept so that make still rebuilds when a source file
# or the shared internal/util package changes.
.PHONY: all
all: \
	metadata \
	hln

metadata: cmd/metadata/*.go
	go build -o $@ ./cmd/metadata

hln: cmd/hln/*.go ../internal/util/*.go
	go build -o $@ ./cmd/hln
|
||||
357
HLN/cmd/hln/hln.go
Normal file
357
HLN/cmd/hln/hln.go
Normal file
@@ -0,0 +1,357 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
// "encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
// "html"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type (
	// Rss is the root element of the RSS feed; only the items of the
	// channel are decoded.
	Rss struct {
		XMLName xml.Name `xml:"rss"`
		Items   []ItemT  `xml:"channel>item"`
	}

	// ItemT is a single <item> of the feed.
	//
	// Data receives the raw inner XML of the item (",innerxml"), so the
	// complete item can be written back to disk unchanged alongside the
	// decoded fields.
	ItemT struct {
		PubDate  string `xml:"pubDate"`
		UnixTime int64  `xml:"unixTime"`
		Guid     string `xml:"guid"`
		Link     string `xml:"link"`
		Title    string `xml:"title"`
		Data     []byte `xml:",innerxml"`
	}
)
|
||||
|
||||
/*
|
||||
type GraphT struct {
|
||||
Graph []map[string]any `json:"@graph"`
|
||||
}
|
||||
*/
|
||||
|
||||
var (
|
||||
p = e.PanicErr
|
||||
w = e.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "does not exist", yields false.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between the first "<unixTime>" tag in the file
// and the first "</unixTime>" tag that follows it.
//
// It returns "" when the file cannot be read or when either tag is missing.
// The opening tag is checked explicitly: adding the tag length to a failed
// strings.Index (-1) would otherwise produce a bogus but valid-looking
// offset, and unrelated text would be returned.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}

	const openTag, closeTag = "<unixTime>", "</unixTime>"

	s := string(b)
	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)

	// Only a closing tag after the opening tag can delimit the value.
	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
defer func() {
|
||||
if e.Panicked {
|
||||
_ = recover()
|
||||
os.Exit(1)
|
||||
}
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/HLN/lock"
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
|
||||
req, err := http.NewRequest("GET", "https://www.hln.be/home/rss.xml", nil)
|
||||
p(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
p(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
p(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
p(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
p(err)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
basename := strings.TrimPrefix(item.Guid, "https://www.hln.be/")
|
||||
basename = strings.TrimSuffix(basename, "/")
|
||||
if n, i := len(basename), strings.Index(basename, "~"); i < n-1 && i > 0 {
|
||||
basename = basename[i+1:]
|
||||
}
|
||||
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
p(os.MkdirAll(dirname, 0777))
|
||||
func() {
|
||||
var ok bool
|
||||
defer func() {
|
||||
if e.Panicked {
|
||||
fmt.Fprintln(os.Stderr, "----", filename)
|
||||
fmt.Fprintln(os.Stderr, "----", item.Link)
|
||||
}
|
||||
if !ok {
|
||||
_ = os.Remove(filename + ".xml")
|
||||
}
|
||||
}()
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
p(err)
|
||||
p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
|
||||
p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
|
||||
p(fp.Write(item.Data))
|
||||
p(fp.WriteString("</item>\n"))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".xml", t, t))
|
||||
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
|
||||
if exists(filename + ".skip") {
|
||||
return true
|
||||
}
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
// _ = os.Remove(filename + ".json")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
} else {
|
||||
if exists(filename + ".txt") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
p(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
p(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
/*
|
||||
s := string(body)
|
||||
ok = true
|
||||
i1 := strings.Index(s, `type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = html.UnescapeString(s[i1:i2])
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var graph GraphT
|
||||
p(json.Unmarshal([]byte(s), &graph))
|
||||
for _, g := range graph.Graph {
|
||||
t := g["@type"]
|
||||
switch v := t.(type) {
|
||||
case string:
|
||||
if v == "NewsArticle" {
|
||||
b, err := json.Marshal(g)
|
||||
p(err)
|
||||
s = string(b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".json")
|
||||
p(err)
|
||||
p(fp.WriteString(s))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
*/
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
articles, err := root.Search(`//article[@id="article-content"]`)
|
||||
p(err)
|
||||
if len(articles) == 0 {
|
||||
_ = w(fmt.Errorf("empty: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "empty: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
article := articles[0]
|
||||
|
||||
tags := make([]string, 0)
|
||||
ell, err := article.Search(`.//*[@data-content-type="LABEL"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
tags = append(tags, s)
|
||||
}
|
||||
}
|
||||
|
||||
pars := make([]string, 0)
|
||||
|
||||
hasIntro := false
|
||||
ell, err = article.Search(`.//*[@data-content-type="INTRO"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
hasIntro = true
|
||||
}
|
||||
}
|
||||
if !hasIntro {
|
||||
_ = w(fmt.Errorf("no intro: %s", url))
|
||||
}
|
||||
|
||||
specials, err := article.Search(`.//*[@data-content-type="GROUP"]`)
|
||||
p(err)
|
||||
for i := len(specials) - 1; i >= 0; i-- {
|
||||
specials[i].Remove()
|
||||
}
|
||||
|
||||
other, err := article.Search(`.//*[@data-content-type="PODCAST"]`)
|
||||
p(err)
|
||||
hasOther := len(other) > 0
|
||||
|
||||
ell, err = article.Search(`.//*[@data-content-type="PARAGRAPH"]`)
|
||||
p(err)
|
||||
if len(ell) == 0 && !hasOther && !hasIntro {
|
||||
_ = w(fmt.Errorf("no paragraphs: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "no paragraphs: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
hasPar := false
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
hasPar = true
|
||||
}
|
||||
}
|
||||
if !hasPar {
|
||||
if !hasOther && !hasIntro {
|
||||
_ = w(fmt.Errorf("no text, skipping: %s", url))
|
||||
}
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
p(fp.WriteString(url + "\n"))
|
||||
p(err)
|
||||
p(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
p(err)
|
||||
|
||||
if len(tags) == 0 {
|
||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
for _, par := range pars {
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
|
||||
p(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
131
HLN/cmd/metadata/metadata.go
Normal file
131
HLN/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored <item> XML file that this program reads:
// only the <unixTime> element is decoded.
type Item struct {
	// XMLName pins the expected root element name to "item".
	XMLName xml.Name `xml:"item"`
	// UnixTime is the content of <unixTime>; doXml interprets it as
	// seconds since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
x(fp.WriteString(s[:i]))
|
||||
x(fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"HLN\"/>\n"))
|
||||
for _, m := range data[base] {
|
||||
x(fp.WriteString(" " + m + "\n"))
|
||||
}
|
||||
x(fp.WriteString(" </metadata>\n "))
|
||||
x(fp.WriteString(stripMeta(s[i:])))
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> block from s,
// together with any spaces, tabs and line breaks that directly follow the
// closing tag, and returns the result.
//
// s is returned unchanged when it contains no "<metadata>" tag, and also
// when the block is not terminated by "</metadata>": removing only the
// opening tag would leave the orphaned contents of the block in the output.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// Search from the opening tag onward so that a stray closing tag
	// earlier in s is not picked up.
	end := strings.Index(s[start:], closeTag)
	if end < 0 {
		return s
	}

	rest := s[start+end+len(closeTag):]
	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
}
|
||||
70
HLN/txt2corpus.sh
Executable file
70
HLN/txt2corpus.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash
#
# Build the corpus files for one day of HLN articles.
#
# Usage: txt2corpus.sh [yyyy-mm-dd]
# Without an argument the day two days before today is processed.

set -e

BASE=/net/corpora/nlnieuws
PART=$BASE/HLN

unset CDPATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh >/dev/null

# Select the day to process.
case "$1" in
    "")
        ds=$(date -d -2days +%Y-%m-%d)
        ;;
    2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
        ds=$1
        ;;
    *)
        echo INVALID
        exit 1
        ;;
esac

# yyyy-mm-dd -> yyyy/mm/dd
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year

cd $PART/$dp

# Take the lock for this directory: a symlink whose target names this process.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

# Drop leading white space and the ##META lines from every article, tokenize
# it, and label each resulting line with hln.<article>.<line number>.
rm -f $corpus.lines
for i in *.txt
do
    b=$(basename $i .txt)
    perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
        | perl -e '$n = 0; while(<>) { $n++; print("hln.'$b'.$n|$_"); }' \
        >> $corpus.lines
done

# Parse the labelled lines; this is run from out/, next to the new xml/ directory.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

# Add the article metadata to the files in xml/.
metadata

cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml

# count per article, not per sentence
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

# Clean up the work directory and release the lock.
cd ../..
rm -fr out

rm -f lock
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
litnl
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
litnl: cmd/litnl/*.go
|
||||
go build -o $@ $^
|
||||
litnl: cmd/litnl/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/LitNL/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -108,16 +109,3 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -22,7 +24,7 @@ var (
|
||||
w = e.WarnErr
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -31,16 +33,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -58,10 +60,10 @@ func main() {
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body//p`)
|
||||
@@ -74,32 +76,8 @@ func main() {
|
||||
_ = w(fmt.Errorf("empty: %s", filename))
|
||||
}
|
||||
for _, p := range pp {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/LitNL
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/LitNL/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
10
Makefile
10
Makefile
@@ -4,6 +4,7 @@ all:
|
||||
make -C BuurtAdam
|
||||
make -C BuurtGrn
|
||||
make -C GG
|
||||
make -C HLN
|
||||
make -C LitNL
|
||||
make -C NieuwsNL
|
||||
make -C NOS
|
||||
@@ -15,13 +16,15 @@ all:
|
||||
make -C Sargasso
|
||||
make -C Sikkom
|
||||
make -C Tzum
|
||||
make -C Volkskrant
|
||||
make -C VRT
|
||||
make bin/data2json
|
||||
make bin/dates2json
|
||||
make bin/flush
|
||||
make bin/items2count
|
||||
make bin/score
|
||||
make bin/rang
|
||||
make bin/top20
|
||||
make bin/trends
|
||||
make bin/week2files
|
||||
|
||||
bin/data2json: cmd/data2json/*.go
|
||||
@@ -36,12 +39,15 @@ bin/flush: cmd/flush/*.go
|
||||
bin/items2count: cmd/items2count/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/score: cmd/score/*.go
|
||||
bin/rang: cmd/rang/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/top20: cmd/top20/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/trends: cmd/trends/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/week2files: cmd/week2files/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
nos
|
||||
|
||||
json2txt: cmd/json2txt/*.go
|
||||
go build -o $@ $^
|
||||
json2txt: cmd/json2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
nos: cmd/nos/*.go
|
||||
go build -o $@ $^
|
||||
nos: cmd/nos/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -26,7 +28,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -34,17 +36,17 @@ func main() {
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
t := time.Now().AddDate(0, 0, -2)
|
||||
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy-mm-dd"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := strings.ReplaceAll(ds, "-", "/")
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -61,13 +63,15 @@ func main() {
|
||||
x(err)
|
||||
item := getItem(b, filename)
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
|
||||
x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
|
||||
}
|
||||
for _, tag := range item.Tags {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
for _, line := range strings.SplitAfter(item.Text, "\n") {
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
x(fp.WriteString(fixSpace(item.Text)))
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
@@ -90,27 +94,3 @@ func getItem(b []byte, filename string) Item {
|
||||
x(json.Unmarshal(b, &item), filename)
|
||||
return item
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"html"
|
||||
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -63,7 +63,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/NOS/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -94,8 +94,7 @@ func main() {
|
||||
}
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -195,16 +194,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
return true
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NOS
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NOS/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +42,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../json2txt $ds
|
||||
json2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,15 +57,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
nu: cmd/nu/*.go
|
||||
go build -o $@ $^
|
||||
nu: cmd/nu/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
@@ -12,7 +14,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -73,7 +74,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/NU/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -97,8 +98,7 @@ func main() {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
s := string(body)
|
||||
ok := true
|
||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
// text bevat kopjes zonder punt aan het eind
|
||||
lines := strings.Split(text, "\n")
|
||||
for i, line := range lines {
|
||||
line = fixSpace(line)
|
||||
n := len(line)
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(line[n-1:], ".!?") {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s := line[n-2:]
|
||||
if s == `."` || s == `!"` || s == `?"` {
|
||||
continue
|
||||
}
|
||||
}
|
||||
lines[i] = line + "."
|
||||
lines[i] = u.AddEnd(u.FixSpace(line, true))
|
||||
}
|
||||
text = strings.Join(lines, "\n") + "\n"
|
||||
text = strings.Join(lines, "") + "\n"
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
p(err)
|
||||
@@ -249,7 +238,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
p(fp.WriteString(text))
|
||||
@@ -259,20 +248,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NU
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NU/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NU/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +55,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
nieuwsnl: cmd/nieuwsnl/*.go
|
||||
go build -o $@ $^
|
||||
nieuwsnl: cmd/nieuwsnl/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
@@ -11,7 +13,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,7 +65,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/NieuwsNL/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -153,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
@@ -173,11 +176,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
} else {
|
||||
for _, a := range aa {
|
||||
tag = strings.ReplaceAll(a.Content(), "\n", " ")
|
||||
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
|
||||
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
p(buf.WriteString(addEnd(fixSpace(title))))
|
||||
p(buf.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
// oud: //div[@id="article-blocks"]//p
|
||||
pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
|
||||
@@ -204,7 +207,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
return false // echt fout
|
||||
}
|
||||
for _, p1 := range pp {
|
||||
p(buf.WriteString(addEnd(fixSpace(p1.Content()))))
|
||||
p(buf.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
@@ -216,40 +219,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,8 +2,11 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/NieuwsNL
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
@@ -25,10 +28,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/NieuwsNL/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -53,14 +57,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
oog
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
oog: cmd/oog/*.go
|
||||
go build -o $@ $^
|
||||
oog: cmd/oog/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Oog/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -111,16 +112,3 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -21,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -30,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -57,41 +59,20 @@ func main() {
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
||||
t := u.FixSpace(cat)
|
||||
if t != "Nieuws" {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||
}
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body/p`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Oog
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Oog/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
parool: cmd/parool/*.go
|
||||
go build -o $@ $^
|
||||
parool: cmd/parool/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,13 +4,16 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
//"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
//"html"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -25,9 +28,16 @@ type ItemT struct {
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
Guid string `xml:"guid"`
|
||||
Link string `xml:"link"`
|
||||
Title string `xml:"title"`
|
||||
Data []byte `xml:",innerxml"`
|
||||
}
|
||||
|
||||
/*
|
||||
type GraphT struct {
|
||||
Graph []map[string]any `json:"@graph"`
|
||||
}
|
||||
*/
|
||||
|
||||
var (
|
||||
p = e.PanicErr
|
||||
w = e.WarnErr
|
||||
@@ -62,7 +72,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Parool/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -92,7 +102,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -122,18 +132,19 @@ func main() {
|
||||
p(fp.WriteString("</item>\n"))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".xml", t, t))
|
||||
ok = doArticle(filename, item.Link, t, needUpdate)
|
||||
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
|
||||
if exists(filename + ".skip") {
|
||||
return true
|
||||
}
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
// _ = os.Remove(filename + ".json")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
} else {
|
||||
if exists(filename + ".txt") {
|
||||
@@ -153,9 +164,67 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
/*
|
||||
|
||||
s := string(body)
|
||||
|
||||
ok = true
|
||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = html.UnescapeString(s[i1:i2])
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var graph GraphT
|
||||
p(json.Unmarshal([]byte(s), &graph))
|
||||
for _, g := range graph.Graph {
|
||||
t := g["@type"]
|
||||
switch v := t.(type) {
|
||||
case string:
|
||||
if v == "NewsArticle" {
|
||||
b, err := json.Marshal(g)
|
||||
p(err)
|
||||
s = string(b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".json")
|
||||
p(err)
|
||||
p(fp.WriteString(s))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
*/
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
articles, err := root.Search(`//article[@id="article-content"]`)
|
||||
@@ -179,70 +248,85 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
}
|
||||
article := articles[0]
|
||||
|
||||
tags := make([]string, 0)
|
||||
ell, err := article.Search(`//header//*[@data-test-id="article-label"]`)
|
||||
live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
tags = append(tags, s)
|
||||
}
|
||||
if len(live) > 0 {
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
p(fp.WriteString("liveblog\n"))
|
||||
p(err)
|
||||
p(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
return true
|
||||
}
|
||||
|
||||
fouten := make([]string, 0)
|
||||
pars := make([]string, 0)
|
||||
|
||||
ell, err = article.Search(`//header//*[@data-test-id="article-title"]`)
|
||||
headers, err := article.Search(`.//header`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
}
|
||||
}
|
||||
if len(headers) == 0 {
|
||||
_ = w(fmt.Errorf("no header: %s", url))
|
||||
|
||||
found := false
|
||||
ell, err = article.Search(`//header//*[@data-test-id="header-intro"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
fouten = append(fouten, fmt.Sprintf("no heading: %s\n", url))
|
||||
_ = w(fmt.Errorf("no heading: %s", url))
|
||||
}
|
||||
|
||||
specials, err := article.Search(`//section//aside | //section//figure | //section//b`)
|
||||
p(err)
|
||||
for _, special := range specials {
|
||||
special.Remove()
|
||||
}
|
||||
|
||||
found = false
|
||||
ell, err = article.Search(`//section//*[@data-article-element-index]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
fouten = append(fouten, fmt.Sprintf("no text: %s\n", url))
|
||||
_ = w(fmt.Errorf("no text: %s", url))
|
||||
}
|
||||
|
||||
if len(fouten) > 0 {
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
for _, fout := range fouten {
|
||||
p(fp.WriteString(fout))
|
||||
p(fmt.Fprintf(fp, "no elements: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
|
||||
}
|
||||
header := headers[0]
|
||||
|
||||
isVideo := false
|
||||
tags := make([]string, 0)
|
||||
ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
|
||||
p(err)
|
||||
if len(ell) == 0 {
|
||||
_ = w(fmt.Errorf("no labels: %s", url))
|
||||
}
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" && s != "Nieuws" {
|
||||
tags = append(tags, s)
|
||||
}
|
||||
if strings.ToLower(s) == "video" {
|
||||
isVideo = true
|
||||
}
|
||||
}
|
||||
|
||||
pars := make([]string, 0)
|
||||
|
||||
found := false
|
||||
ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
_ = w(fmt.Errorf("no intro: %s", url))
|
||||
}
|
||||
|
||||
specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
|
||||
p(err)
|
||||
for i := len(specials) - 1; i >= 0; i-- {
|
||||
specials[i].Remove()
|
||||
}
|
||||
|
||||
ell, err = article.Search(`.//section//*[@data-article-element-index]`)
|
||||
p(err)
|
||||
if len(ell) == 0 {
|
||||
_ = w(fmt.Errorf("no elements: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "no elements: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
@@ -255,6 +339,32 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
return false
|
||||
}
|
||||
|
||||
found = false
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
if !isVideo {
|
||||
_ = w(fmt.Errorf("no text, skipping: %s", url))
|
||||
}
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
p(fp.WriteString(url + "\n"))
|
||||
p(err)
|
||||
p(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
p(err)
|
||||
|
||||
@@ -262,12 +372,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
for _, par := range pars {
|
||||
p(fp.WriteString(addEnd(fixSpace(par))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
@@ -276,43 +388,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as a single newline-terminated line that
// ends in sentence-final punctuation.
//
// An empty or all-whitespace string yields "". If the trimmed text already
// ends in '.', '!' or '?', optionally followed by a closing quote (", ' or ”),
// only the newline is added; otherwise ".\n" is appended.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if line == "" {
		return ""
	}

	// Every ending that counts as "already terminated".
	terminated := []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
		`.”`, `!”`, `?”`,
	}
	for _, end := range terminated {
		if strings.HasSuffix(line, end) {
			return line + "\n"
		}
	}
	return line + ".\n"
}
|
||||
|
||||
// fixSpace normalizes the whitespace in s: leading and trailing whitespace
// is dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Parool
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Parool/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
95
README.md
Normal file
95
README.md
Normal file
@@ -0,0 +1,95 @@
|
||||
# Actuele nieuwsberichten
|
||||
|
||||
- voor Alpino: nieuwe namen en nieuwe woorden
|
||||
- voor *Woord van de maand* ook: toplijsten van personen, plaatsen,
|
||||
organisaties en andere namen (TODO: url van webapp)
|
||||
|
||||
Voor interactief gebruik:
|
||||
|
||||
```
|
||||
query.sh
|
||||
```
|
||||
|
||||
## 1. Verzamelen van berichten
|
||||
|
||||
Berichten van NieuwsNL in `NieuwsNL/yyyy/mm/dd/`
|
||||
|
||||
Overigen in `[A-Z]*/yyyy/wNN/` (NN = ISO-weeknummer met twee cijfers, bijvoorbeeld `2024/w07/`)
|
||||
|
||||
crontab van p209327@colossus
|
||||
|
||||
```
|
||||
# m h dom mon dow command
|
||||
3 * * * * /net/corpora/nlnieuws/AT5/at5
|
||||
4 * * * * /net/corpora/nlnieuws/BuurtAdam/buurtadam
|
||||
5 * * * * /net/corpora/nlnieuws/BuurtGrn/buurtgrn
|
||||
6 * * * * /net/corpora/nlnieuws/GG/gg
|
||||
7 * * * * /net/corpora/nlnieuws/HLN/hln
|
||||
8 * * * * /net/corpora/nlnieuws/LitNL/litnl
|
||||
9 * * * * /net/corpora/nlnieuws/NieuwsNL/nieuwsnl
|
||||
10 * * * * /net/corpora/nlnieuws/NOS/nos
|
||||
11 * * * * /net/corpora/nlnieuws/NU/nu
|
||||
12 * * * * /net/corpora/nlnieuws/Oog/oog
|
||||
13 * * * * /net/corpora/nlnieuws/Parool/parool
|
||||
14 * * * * /net/corpora/nlnieuws/RO/ro
|
||||
15 * * * * /net/corpora/nlnieuws/RTVNoord/rtvnoord
|
||||
16 * * * * /net/corpora/nlnieuws/Sargasso/sargasso
|
||||
17 * * * * /net/corpora/nlnieuws/Sikkom/sikkom
|
||||
18 * * * * /net/corpora/nlnieuws/Tzum/tzum
|
||||
19 * * * * /net/corpora/nlnieuws/VRT/vrt
|
||||
20 * * * * /net/corpora/nlnieuws/Volkskrant/volkskrant
|
||||
```
|
||||
|
||||
## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen
|
||||
|
||||
Uitvoer in `[A-Z]*/corpus/`
|
||||
|
||||
crontab van p209327@colossus
|
||||
|
||||
```
|
||||
# m h dom mon dow command
|
||||
# veel data: elke dag
|
||||
0 1 * * * /net/corpora/nlnieuws/HLN/txt2corpus.sh
|
||||
0 1 * * * /net/corpora/nlnieuws/NOS/txt2corpus.sh
|
||||
0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh
|
||||
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
|
||||
0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh
|
||||
0 1 * * * /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh
|
||||
# weinig data: alleen op dinsdag
|
||||
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/RTVNoord/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
|
||||
0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh
|
||||
```
|
||||
|
||||
## 3. Queries uitvoeren, tellingen doen
|
||||
|
||||
Tellingen in `data/`
|
||||
|
||||
Gegevens voor webapp in `data/json/`
|
||||
|
||||
Op woensdag
|
||||
|
||||
crontab van p209327@colossus
|
||||
|
||||
```
|
||||
# m h dom mon dow command
|
||||
0 1 * * 3 /net/corpora/nlnieuws/collect.sh
|
||||
```
|
||||
|
||||
## 4. Data in json op webplatform zetten
|
||||
|
||||
crontab van f109308@colossus
|
||||
|
||||
```
|
||||
# m h dom mon dow command
|
||||
30 0-23/4 * * * rsync -e 'ssh -F /net/aistaff/alfa/.ssh/config' -a --no-g /net/corpora/nlnieuws/data/json/ webalfa:/home/www/f109308/site/wvdm/data
|
||||
```
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
ro
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
ro: cmd/ro/*.go
|
||||
go build -o $@ $^
|
||||
ro: cmd/ro/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/RO/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -111,16 +112,3 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,8 @@ import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/textcat/v2"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
@@ -24,7 +26,7 @@ var (
|
||||
x = e.ExitErr
|
||||
w = e.WarnErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -36,16 +38,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -61,19 +63,19 @@ func main() {
|
||||
var buf bytes.Buffer
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
x(buf.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
|
||||
x(err)
|
||||
for _, div := range divs {
|
||||
div.Remove()
|
||||
for i := len(divs) - 1; i >= 0; i-- {
|
||||
divs[i].Remove()
|
||||
}
|
||||
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
|
||||
text := buf.String()
|
||||
@@ -90,33 +92,12 @@ func main() {
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
||||
t := u.FixSpace(cat)
|
||||
if t != "Artikelen" && t != "cafeyn" {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||
}
|
||||
}
|
||||
x(fp.WriteString(text))
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as a single newline-terminated line that
// ends in sentence-final punctuation.
//
// An empty or all-whitespace string yields "". If the trimmed text already
// ends in '.', '!' or '?', optionally followed by one closing quote, only the
// newline is added; otherwise ".\n" is appended.
//
// Besides the ASCII quotes " and ', the typographic closing quote ” is
// recognised, so that a paragraph such as “Ja.” does not end up with a
// spurious second period.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// The quote may be multi-byte (”), so compare whole suffixes instead of
	// slicing off a fixed number of bytes.
	for _, mark := range []string{".", "!", "?"} {
		for _, quote := range []string{"", `"`, `'`, `”`} {
			if strings.HasSuffix(s, mark+quote) {
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}
|
||||
|
||||
// fixSpace normalizes the whitespace in s: leading and trailing whitespace
// is dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/RO
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/RO/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/RO/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
rtvnoord: cmd/rtvnoord/*.go
|
||||
go build -o $@ $^
|
||||
rtvnoord: cmd/rtvnoord/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
@@ -10,7 +12,6 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -75,7 +76,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/RTVNoord/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -100,7 +101,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -225,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
// text bevat kopjes zonder punt aan het eind
|
||||
lines := strings.Split(doc.Text, "\n")
|
||||
for i, line := range lines {
|
||||
lines[i] = addEnd(fixSpace(line))
|
||||
lines[i] = u.AddEnd(u.FixSpace(line, true))
|
||||
}
|
||||
text := strings.Join(lines, "") + "\n"
|
||||
|
||||
@@ -235,16 +236,21 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range doc.Tags {
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(fixSpace(tag))))
|
||||
t := strings.ToLower(u.FixSpace(tag))
|
||||
if strings.HasPrefix(t, "br_") {
|
||||
continue
|
||||
}
|
||||
t = strings.TrimPrefix(t, "tr_")
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||
}
|
||||
}
|
||||
if doc.Cat == "" {
|
||||
p(fmt.Fprintln(fp, "##META text cat ="))
|
||||
} else {
|
||||
p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(doc.Cat)))
|
||||
p(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(doc.Cat)))
|
||||
}
|
||||
|
||||
p(fp.WriteString(addEnd(doc.Title)))
|
||||
p(fp.WriteString(u.AddEnd(doc.Title)))
|
||||
|
||||
p(fp.WriteString(text))
|
||||
p(fp.Close())
|
||||
@@ -252,40 +258,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as a single newline-terminated line that
// ends in sentence-final punctuation.
//
// An empty or all-whitespace string yields "". If the trimmed text already
// ends in '.', '!' or '?', optionally followed by one closing quote, only the
// newline is added; otherwise ".\n" is appended.
//
// Besides the ASCII quotes " and ', the typographic closing quote ” is
// recognised, so that a paragraph such as “Ja.” does not end up with a
// spurious second period.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// The quote may be multi-byte (”), so compare whole suffixes instead of
	// slicing off a fixed number of bytes.
	for _, mark := range []string{".", "!", "?"} {
		for _, quote := range []string{"", `"`, `'`, `”`} {
			if strings.HasSuffix(s, mark+quote) {
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}
|
||||
|
||||
// fixSpace normalizes the whitespace in s: leading and trailing whitespace
// is dropped and every internal run of whitespace becomes one space.
func fixSpace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/RTVNoord
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/RTVNoord/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,15 +56,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
sargasso
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
sargasso: cmd/sargasso/*.go
|
||||
go build -o $@ $^
|
||||
sargasso: cmd/sargasso/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Sargasso/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -111,16 +112,3 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -21,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -30,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -57,41 +59,17 @@ func main() {
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body//p`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as a single newline-terminated line that
// ends in sentence-final punctuation.
//
// An empty or all-whitespace string yields "". If the trimmed text already
// ends in '.', '!' or '?', optionally followed by one closing quote, only the
// newline is added; otherwise ".\n" is appended.
//
// Besides the ASCII quotes " and ', the typographic closing quote ” is
// recognised, so that a paragraph such as “Ja.” does not end up with a
// spurious second period.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// The quote may be multi-byte (”), so compare whole suffixes instead of
	// slicing off a fixed number of bytes.
	for _, mark := range []string{".", "!", "?"} {
		for _, quote := range []string{"", `"`, `'`, `”`} {
			if strings.HasSuffix(s, mark+quote) {
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Sargasso
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Sargasso/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
sikkom: cmd/sikkom/*.go
|
||||
go build -o $@ $^
|
||||
sikkom: cmd/sikkom/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
@@ -11,7 +13,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,7 +65,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Sikkom/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -89,7 +90,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -151,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
s := string(body)
|
||||
|
||||
ok := true
|
||||
@@ -220,49 +223,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
fp, err = os.Create(filename + ".txt")
|
||||
p(err)
|
||||
|
||||
p(fp.WriteString(addEnd(fixSpace(title))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
for _, p1 := range pp {
|
||||
p(fp.WriteString(addEnd(fixSpace(p1.Content()))))
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Sikkom
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Sikkom/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,7 +56,7 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
|
||||
@@ -3,11 +3,11 @@ all: \
|
||||
metadata \
|
||||
tzum
|
||||
|
||||
xml2txt: cmd/xml2txt/*.go
|
||||
go build -o $@ $^
|
||||
xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
tzum: cmd/tzum/*.go
|
||||
go build -o $@ $^
|
||||
tzum: cmd/tzum/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -3,13 +3,14 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -46,7 +47,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Tzum/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -76,7 +77,7 @@ func main() {
|
||||
}
|
||||
p(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -111,16 +112,3 @@ func main() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -21,7 +23,7 @@ type Item struct {
|
||||
var (
|
||||
x = e.ExitErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -30,16 +32,16 @@ func main() {
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
ds = fmt.Sprintf("%d.%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
x(fmt.Errorf("arg must be yyyy.ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
dp := ds[:4] + "/w" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
@@ -57,10 +59,14 @@ func main() {
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
||||
t := u.FixSpace(cat)
|
||||
if t == "Nieuws" {
|
||||
continue
|
||||
}
|
||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body/p`)
|
||||
@@ -68,33 +74,9 @@ func main() {
|
||||
for _, p := range pp {
|
||||
s := p.Content()
|
||||
if !strings.Contains(s, "verscheen eerst op Tzum.") {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
|
||||
}
|
||||
}
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Tzum
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Tzum/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -38,7 +43,7 @@ fi
|
||||
rm -fr out
|
||||
mkdir out
|
||||
|
||||
../../xml2txt $ds
|
||||
xml2txt $ds
|
||||
|
||||
rm -f $corpus.lines
|
||||
for i in out/*.txt
|
||||
@@ -53,14 +58,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
@@ -5,5 +5,5 @@ all: \
|
||||
metadata: cmd/metadata/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
vrt: cmd/vrt/*.go
|
||||
go build -o $@ $^
|
||||
vrt: cmd/vrt/*.go ../internal/util/*.go
|
||||
go build -o $@ $<
|
||||
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
@@ -11,7 +13,6 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -78,7 +79,7 @@ func main() {
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/VRT/lock"
|
||||
mkLock(myLock)
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
@@ -109,8 +110,7 @@ func main() {
|
||||
if t2.After(t) {
|
||||
t = t2
|
||||
}
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
/*
|
||||
s := string(body)
|
||||
ok := true
|
||||
@@ -242,18 +244,18 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
||||
p(fmt.Fprintln(&buf, "##META text cat ="))
|
||||
} else {
|
||||
for _, cat := range cats {
|
||||
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
|
||||
p(fmt.Fprintf(&buf, "##META text cat = %s\n", u.FixSpace(cat)))
|
||||
}
|
||||
}
|
||||
if len(tags) == 0 {
|
||||
p(fmt.Fprintln(&buf, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
|
||||
p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
_, err = buf.WriteString(addEnd(fixSpace(title)))
|
||||
_, err = buf.WriteString(u.AddEnd(u.FixSpace(title)))
|
||||
p(err)
|
||||
|
||||
fouten := make([]string, 0)
|
||||
@@ -262,7 +264,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
||||
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`)
|
||||
p(err)
|
||||
for _, p1 := range pp {
|
||||
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
|
||||
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
|
||||
found = true
|
||||
}
|
||||
if !found {
|
||||
@@ -277,7 +279,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
||||
`//div[@data-sentry-component="ArticleTitle"]//h2`)
|
||||
p(err)
|
||||
for _, p1 := range pp {
|
||||
p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
|
||||
p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
|
||||
found = true
|
||||
}
|
||||
if !found {
|
||||
@@ -311,40 +313,3 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func addEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
n := len(s)
|
||||
if n == 0 {
|
||||
return ""
|
||||
}
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(s[n-1:], ".!?") {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s2 := s[n-2:]
|
||||
if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
|
||||
return s + "\n"
|
||||
}
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func fixSpace(s string) string {
|
||||
return strings.Join(strings.Fields(s), " ")
|
||||
}
|
||||
|
||||
func mkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/VRT
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -2days +%Y-%m-%d`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -23,10 +26,11 @@ else
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%%-*}
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/VRT/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,15 +55,15 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
9
Volkskrant/Makefile
Normal file
9
Volkskrant/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Build the Volkskrant tools: the metadata injector and the article fetcher.
# `all` is not a file, so it is declared phony; otherwise a stray file named
# "all" would stop the tools from being rebuilt.
.PHONY: all
all: \
	metadata \
	volkskrant

# $^ hands every source file of the command to the compiler.
metadata: cmd/metadata/*.go
	go build -o $@ $^

# The shared util sources are listed only so that a change there triggers a
# rebuild; $< passes just the first prerequisite to go build, the util
# package itself is found through its import path.
volkskrant: cmd/volkskrant/*.go ../internal/util/*.go
	go build -o $@ $<
|
||||
131
Volkskrant/cmd/metadata/metadata.go
Normal file
131
Volkskrant/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a saved article record (an <item> document) that this
// tool reads: the timestamp doXml turns into the "pubdate" metadata value.
type Item struct {
	// XMLName pins the expected root element to <item>.
	XMLName xml.Name `xml:"item"`
	// UnixTime is the <unixTime> value, in seconds since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
x(fp.WriteString(s[:i]))
|
||||
x(fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Volkskrant\"/>\n"))
|
||||
for _, m := range data[base] {
|
||||
x(fp.WriteString(" " + m + "\n"))
|
||||
}
|
||||
x(fp.WriteString(" </metadata>\n "))
|
||||
x(fp.WriteString(stripMeta(s[i:])))
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with the whitespace that follows it. A string without a
// <metadata> tag is returned unchanged.
//
// NOTE(review): when the opening tag is present but the closing tag is not,
// the inner Index yields -1 and the arithmetic ends up cutting exactly the
// opening tag; confirm whether that is the intended fallback.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	end := start + strings.Index(s[start:], closeTag) + len(closeTag)
	tail := strings.TrimLeft(s[end:], " \t\r\n")
	return s[:start] + tail
}
|
||||
390
Volkskrant/cmd/volkskrant/volkskrant.go
Normal file
390
Volkskrant/cmd/volkskrant/volkskrant.go
Normal file
@@ -0,0 +1,390 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
|
||||
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
||||
|
||||
//"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
//"html"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is a single <item> of the RSS feed.
type ItemT struct {
	// PubDate is the raw <pubDate> text; main parses it as an RFC 1123 date.
	PubDate string `xml:"pubDate"`
	// UnixTime maps a <unixTime> child (main writes such an element into
	// the item files it saves).
	UnixTime int64 `xml:"unixTime"`
	// Guid is the <guid> text; main uses it as the base of the file name.
	Guid string `xml:"guid"`
	// Link is the article URL that gets downloaded.
	Link string `xml:"link"`
	// Title is the article headline.
	Title string `xml:"title"`
	// Data is the raw XML found inside the <item> element, kept so the
	// item can be written to disk as it was received.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
/*
|
||||
type GraphT struct {
|
||||
Graph []map[string]any `json:"@graph"`
|
||||
}
|
||||
*/
|
||||
|
||||
var (
|
||||
p = e.PanicErr
|
||||
w = e.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error, not
// only "does not exist", counts as absent.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between the first <unixTime> tag in the named
// file and the </unixTime> tag that follows it.
//
// It returns "" when the file cannot be read or when either tag is missing.
// Both tags are checked explicitly: adding the tag length to a failed Index
// result (-1) would otherwise produce a valid-looking offset and return an
// unrelated piece of the file.
func fileDate(filename string) string {
	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)
	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)
	// Search for the closing tag only after the opening tag.
	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
defer func() {
|
||||
if e.Panicked {
|
||||
_ = recover()
|
||||
os.Exit(1)
|
||||
}
|
||||
}()
|
||||
|
||||
myLock := "/net/corpora/nlnieuws/Volkskrant/lock"
|
||||
u.MkLock(myLock)
|
||||
defer func() {
|
||||
_ = os.Remove(myLock)
|
||||
}()
|
||||
|
||||
req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil)
|
||||
p(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
p(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
p(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
p(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
p(err)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
if exists(dirname + "/lock") {
|
||||
continue
|
||||
}
|
||||
basename := item.Guid
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
p(os.MkdirAll(dirname, 0777))
|
||||
func() {
|
||||
var ok bool
|
||||
defer func() {
|
||||
if e.Panicked {
|
||||
fmt.Fprintln(os.Stderr, "----", filename)
|
||||
fmt.Fprintln(os.Stderr, "----", item.Link)
|
||||
}
|
||||
if !ok {
|
||||
_ = os.Remove(filename + ".xml")
|
||||
}
|
||||
}()
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
p(err)
|
||||
p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
|
||||
p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
|
||||
p(fp.Write(item.Data))
|
||||
p(fp.WriteString("</item>\n"))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".xml", t, t))
|
||||
ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
|
||||
if exists(filename + ".skip") {
|
||||
return true
|
||||
}
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
// _ = os.Remove(filename + ".json")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
} else {
|
||||
if exists(filename + ".txt") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
p(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
p(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
p(err)
|
||||
p(resp.Body.Close())
|
||||
|
||||
body = u.HtmlFix(body)
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
p(err)
|
||||
|
||||
/*
|
||||
|
||||
s := string(body)
|
||||
|
||||
ok = true
|
||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = html.UnescapeString(s[i1:i2])
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var graph GraphT
|
||||
p(json.Unmarshal([]byte(s), &graph))
|
||||
for _, g := range graph.Graph {
|
||||
t := g["@type"]
|
||||
switch v := t.(type) {
|
||||
case string:
|
||||
if v == "NewsArticle" {
|
||||
b, err := json.Marshal(g)
|
||||
p(err)
|
||||
s = string(b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".json")
|
||||
p(err)
|
||||
p(fp.WriteString(s))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
*/
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
articles, err := root.Search(`//article[@id="article-content"]`)
|
||||
p(err)
|
||||
if len(articles) == 0 {
|
||||
_ = w(fmt.Errorf("empty: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "empty: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
article := articles[0]
|
||||
|
||||
live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
|
||||
p(err)
|
||||
if len(live) > 0 {
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
p(fp.WriteString("liveblog\n"))
|
||||
p(err)
|
||||
p(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
return true
|
||||
}
|
||||
|
||||
headers, err := article.Search(`.//header`)
|
||||
p(err)
|
||||
if len(headers) == 0 {
|
||||
_ = w(fmt.Errorf("no header: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "no elements: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
|
||||
}
|
||||
header := headers[0]
|
||||
|
||||
isOpinie := false
|
||||
isColumn := false
|
||||
tags := make([]string, 0)
|
||||
ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
|
||||
p(err)
|
||||
if len(ell) == 0 {
|
||||
_ = w(fmt.Errorf("no labels: %s", url))
|
||||
}
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" && s != "Nieuws" {
|
||||
tags = append(tags, s)
|
||||
}
|
||||
if s1 := strings.ToLower(s); s1 == "opinie" {
|
||||
isOpinie = true
|
||||
} else if s1 == "column" {
|
||||
isColumn = true
|
||||
}
|
||||
}
|
||||
|
||||
pars := make([]string, 0)
|
||||
|
||||
found := false
|
||||
ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
|
||||
p(err)
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found && !isOpinie && !isColumn {
|
||||
_ = w(fmt.Errorf("no intro: %s", url))
|
||||
}
|
||||
|
||||
specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
|
||||
p(err)
|
||||
for i := len(specials) - 1; i >= 0; i-- {
|
||||
specials[i].Remove()
|
||||
}
|
||||
|
||||
ell, err = article.Search(`.//section//*[@data-article-element-index]`)
|
||||
p(err)
|
||||
if len(ell) == 0 {
|
||||
_ = w(fmt.Errorf("no elements: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
p(err)
|
||||
p(fmt.Fprintf(fp, "no elements: %s\n", url))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
found = false
|
||||
for _, el := range ell {
|
||||
s := strings.TrimSpace(el.Content())
|
||||
if s != "" {
|
||||
pars = append(pars, s)
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
_ = w(fmt.Errorf("no text, skipping: %s", url))
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
p(fp.WriteString(url + "\n"))
|
||||
p(err)
|
||||
p(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
p(err)
|
||||
p(fp.Write(body))
|
||||
p(fp.Close())
|
||||
p(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
p(err)
|
||||
|
||||
if len(tags) == 0 {
|
||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
|
||||
|
||||
for _, par := range pars {
|
||||
p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
|
||||
}
|
||||
|
||||
p(fp.Close())
|
||||
|
||||
p(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
70
Volkskrant/txt2corpus.sh
Executable file
70
Volkskrant/txt2corpus.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash

# Parse one day of Volkskrant article text files with Alpino and pack the
# result into a compact corpus.
#
# usage: txt2corpus.sh [YYYY-MM-DD]
#
# Without an argument the day two days ago is processed.  The script works
# inside $PART/YYYY/MM/DD, guarded by a symlink lock in that directory, and
# writes $PART/corpus/YYYY/YYYY-MM-DD.{lines,log,data.dz,index,tag.txt}.

set -e

BASE=/net/corpora/nlnieuws
PART=$BASE/Volkskrant

unset CDPATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(date -d -2days +%Y-%m-%d)
else
    case "$1" in
        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# 2026-03-14 -> 2026/03/14 (directory of that day) and 2026 (year)
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p "$PART/corpus/$year"

cd "$PART/$dp"

# Symlink creation is atomic: it fails when another process holds the lock.
# NOTE(review): the lock is not removed when a later step fails under
# `set -e`; confirm whether that is intentional before adding a trap.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

# One tokenized sentence per line, labelled vk.<article>.<sentence-number>
rm -f "$corpus.lines"
for i in *.txt
do
    b=$(basename "$i" .txt)
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | perl -e '$n = 0; while(<>) { $n++; print("vk.'"$b"'.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

metadata

cd xml
rm -f "$corpus.data.dz" "$corpus.index"
alto -q -o "$corpus.data.dz" *.xml

# count per article, not per sentence
query.sh -x T -s "$corpus.data.dz" > "$corpus.tag.txt"

cd ../..
rm -fr out

rm -f lock
|
||||
@@ -7,23 +7,26 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Data struct {
|
||||
Year int `json:"year"`
|
||||
Week int `json:"week"`
|
||||
First string `json:"first"`
|
||||
Last string `json:"last"`
|
||||
Period int `json:"period"`
|
||||
Start string `json:"start"`
|
||||
Algemeen *Parts `json:"Algemeen"`
|
||||
Groningen *Parts `json:"Groningen"`
|
||||
Amsterdam *Parts `json:"Amsterdam"`
|
||||
Literatuur *Parts `json:"Literatuur"`
|
||||
Vlaanderen *Parts `json:"Vlaanderen"`
|
||||
Year int `json:"year"`
|
||||
Week int `json:"week"`
|
||||
First string `json:"first"`
|
||||
Last string `json:"last"`
|
||||
Period int `json:"period"`
|
||||
Start string `json:"start"`
|
||||
Max int `json:"max"`
|
||||
Sources map[string]int `json:"sources"`
|
||||
Algemeen *Parts `json:"Algemeen"`
|
||||
Groningen *Parts `json:"Groningen"`
|
||||
Amsterdam *Parts `json:"Amsterdam"`
|
||||
Literatuur *Parts `json:"Literatuur"`
|
||||
Vlaanderen *Parts `json:"Vlaanderen"`
|
||||
}
|
||||
|
||||
type Parts struct {
|
||||
@@ -47,13 +50,14 @@ var (
|
||||
parts = map[string]struct {
|
||||
file string
|
||||
suffix string
|
||||
re *regexp.Regexp
|
||||
}{
|
||||
"nieuwe namen": {"nieuwe-namen", ".t20"},
|
||||
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
|
||||
"personen": {"personen", ""},
|
||||
"andere namen": {"overige-namen", ""},
|
||||
"locaties": {"locaties", ""},
|
||||
"organisaties": {"organisaties", ""},
|
||||
"nieuwe namen": {"nieuwe-namen", ".t20", nil},
|
||||
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil},
|
||||
"personen": {"personen", "", nil},
|
||||
"andere namen": {"overige-namen", "", nil},
|
||||
"locaties": {"locaties", "", nil},
|
||||
"organisaties": {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)},
|
||||
}
|
||||
|
||||
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
|
||||
@@ -68,9 +72,9 @@ var (
|
||||
|
||||
func main() {
|
||||
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
aa := strings.Split(os.Args[1], ".")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||
}
|
||||
|
||||
var err error
|
||||
@@ -88,8 +92,9 @@ func main() {
|
||||
x(fmt.Errorf("ongeldige week: %d", week))
|
||||
}
|
||||
|
||||
start, first, last := dates()
|
||||
start, first, last, names := dates()
|
||||
|
||||
max, sources := makeCounts(names)
|
||||
data := &Data{
|
||||
Year: year,
|
||||
Week: week,
|
||||
@@ -97,6 +102,8 @@ func main() {
|
||||
Last: last,
|
||||
Period: size,
|
||||
Start: start,
|
||||
Max: max,
|
||||
Sources: sources,
|
||||
Algemeen: makeParts("Algemeen"),
|
||||
Groningen: makeParts("Groningen"),
|
||||
Amsterdam: makeParts("Amsterdam"),
|
||||
@@ -107,7 +114,6 @@ func main() {
|
||||
b, err := json.Marshal(data)
|
||||
x(err)
|
||||
fmt.Println(string(b))
|
||||
|
||||
}
|
||||
|
||||
func makeParts(source string) *Parts {
|
||||
@@ -124,7 +130,8 @@ func makeParts(source string) *Parts {
|
||||
func makeValues(source, part string) [][5]any {
|
||||
v := make([][5]any, 0)
|
||||
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d-%d%s",
|
||||
year,
|
||||
sources[source],
|
||||
parts[part].file,
|
||||
year,
|
||||
@@ -137,12 +144,15 @@ func makeValues(source, part string) [][5]any {
|
||||
scanner := bufio.NewScanner(fp)
|
||||
lineno := 0
|
||||
for scanner.Scan() {
|
||||
lineno++
|
||||
line := scanner.Text()
|
||||
aa := strings.Split(line, "\t")
|
||||
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
word := aa[1]
|
||||
if parts[part].re != nil && parts[part].re.MatchString(word) {
|
||||
continue
|
||||
}
|
||||
lineno++
|
||||
var tags, lemma, postag string
|
||||
if len(aa) > 2 {
|
||||
tags = aa[2]
|
||||
@@ -163,7 +173,43 @@ func makeValues(source, part string) [][5]any {
|
||||
return v
|
||||
}
|
||||
|
||||
func dates() (start, first, last string) {
|
||||
func makeCounts(names []string) (int, map[string]int) {
|
||||
max := 0
|
||||
counts := make(map[string]int)
|
||||
x(os.Chdir("/net/corpora/nlnieuws"))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
if !file.IsDir() {
|
||||
continue
|
||||
}
|
||||
filename := file.Name()
|
||||
if filename[0] < 'A' || filename[0] > 'Z' {
|
||||
continue
|
||||
}
|
||||
count := 0
|
||||
for _, name := range names {
|
||||
files2, err := os.ReadDir(filename + "/" + name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, f := range files2 {
|
||||
if n := f.Name(); strings.HasSuffix(n, ".xml") {
|
||||
count++
|
||||
} else if strings.HasSuffix(n, ".skip") {
|
||||
count--
|
||||
}
|
||||
}
|
||||
}
|
||||
counts[filename] = count
|
||||
if count > max {
|
||||
max = count
|
||||
}
|
||||
}
|
||||
return max, counts
|
||||
}
|
||||
|
||||
func dates() (start, first, last string, names []string) {
|
||||
|
||||
// 1 januari
|
||||
t := time.Date(year, 1, 1, 12, 0, 0, 0, time.UTC)
|
||||
@@ -182,7 +228,20 @@ func dates() (start, first, last string) {
|
||||
t2 := t.AddDate(0, 0, 6)
|
||||
tStart := t.AddDate(0, 0, (1-size)*7)
|
||||
|
||||
return makeDate(tStart), makeDate(t), makeDate(t2)
|
||||
names = make([]string, 0)
|
||||
t3 := tStart
|
||||
for range size {
|
||||
y, w := t3.ISOWeek()
|
||||
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
|
||||
t3 = t3.AddDate(0, 0, 7)
|
||||
}
|
||||
t3 = tStart
|
||||
for range 7 * size {
|
||||
names = append(names, fmt.Sprintf("%d/%02d/%02d", t3.Year(), t3.Month(), t3.Day()))
|
||||
t3 = t3.AddDate(0, 0, 1)
|
||||
}
|
||||
|
||||
return makeDate(tStart), makeDate(t), makeDate(t2), names
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -26,13 +26,24 @@ var (
|
||||
|
||||
func main() {
|
||||
|
||||
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||
x(err)
|
||||
for _, dir := range dirs {
|
||||
if !dir.IsDir() {
|
||||
continue
|
||||
}
|
||||
dirname := dir.Name()
|
||||
if dirname[0] != '2' {
|
||||
continue
|
||||
}
|
||||
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
|
||||
x(err)
|
||||
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||
addWeek(filename[5:12])
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||
addWeek(filename[5:12])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
63
cmd/rang/rang.go
Normal file
63
cmd/rang/rang.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package main
|
||||
|
||||
// alto 'fp://node[....]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
word string
|
||||
count int
|
||||
}
|
||||
|
||||
func main() {
|
||||
|
||||
counts := make(map[string]int)
|
||||
|
||||
scanner := bufio.NewScanner(os.Stdin)
|
||||
for scanner.Scan() {
|
||||
word := strings.Split(scanner.Text(), "\t")[0]
|
||||
counts[word] = counts[word] + 1
|
||||
}
|
||||
x(scanner.Err())
|
||||
|
||||
items := make([]Item, 0)
|
||||
for key, value := range counts {
|
||||
items = append(items, Item{
|
||||
word: key,
|
||||
count: value,
|
||||
})
|
||||
}
|
||||
|
||||
sort.Slice(items, func(a, b int) bool {
|
||||
if items[a].count == items[b].count {
|
||||
return items[a].word < items[b].word
|
||||
}
|
||||
return items[a].count > items[b].count
|
||||
})
|
||||
|
||||
rang := 0
|
||||
prev := 0
|
||||
for _, item := range items {
|
||||
if item.count < 2 {
|
||||
break
|
||||
}
|
||||
if item.count != prev {
|
||||
rang++
|
||||
prev = item.count
|
||||
}
|
||||
fmt.Printf("%d\t%s\n", rang, item.word)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
text string
|
||||
lctext string
|
||||
score int
|
||||
isnew bool
|
||||
}
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||
count = make(map[string]int)
|
||||
items = make([]Item, 0)
|
||||
)
|
||||
|
||||
func main() {
|
||||
filename := os.Args[1]
|
||||
prevname := getPrev(filename)
|
||||
|
||||
fp, err := os.Open(prevname)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
count[aa[1]] = n
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
|
||||
fp, err = os.Open(filename)
|
||||
x(err)
|
||||
scanner = bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
n1, ok := count[aa[1]]
|
||||
items = append(items, Item{
|
||||
text: aa[1],
|
||||
lctext: strings.ToLower(aa[1]),
|
||||
score: n - n1,
|
||||
isnew: !ok,
|
||||
})
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
|
||||
sort.Slice(items, func(i, j int) bool {
|
||||
/*
|
||||
if items[i].isnew && !items[j].isnew {
|
||||
return true
|
||||
}
|
||||
if !items[i].isnew && items[j].isnew {
|
||||
return false
|
||||
}
|
||||
*/
|
||||
if items[i].score != items[j].score {
|
||||
return items[i].score > items[j].score
|
||||
}
|
||||
return items[i].lctext < items[j].lctext
|
||||
})
|
||||
|
||||
for _, item := range items {
|
||||
/*
|
||||
if item.score < 2 {
|
||||
break
|
||||
}
|
||||
*/
|
||||
p := "."
|
||||
if item.isnew {
|
||||
p = "N"
|
||||
}
|
||||
fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func getPrev(filename string) string {
|
||||
mm := reYearWeek.FindStringSubmatch(filename)
|
||||
year, err := strconv.Atoi(mm[2][:4])
|
||||
x(err)
|
||||
week, err := strconv.Atoi(mm[2][5:])
|
||||
x(err)
|
||||
|
||||
week--
|
||||
if week == 0 {
|
||||
week = 53
|
||||
year--
|
||||
}
|
||||
newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
|
||||
if week == 53 {
|
||||
_, err := os.Stat(newname)
|
||||
if err == nil {
|
||||
return newname
|
||||
}
|
||||
newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
|
||||
}
|
||||
return newname
|
||||
}
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]\.[0-5][0-9])(.*)`)
|
||||
seen = make(map[string]bool)
|
||||
)
|
||||
|
||||
@@ -23,21 +23,30 @@ func main() {
|
||||
suffix := m[3] + ".t20"
|
||||
target := infile + ".t20"
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/data"))
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
dirs, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
name := file.Name()
|
||||
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
|
||||
fp, err := os.Open(name)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
seen[strings.Split(scanner.Text(), "\t")[1]] = true
|
||||
for _, dir := range dirs {
|
||||
if !dir.IsDir() {
|
||||
continue
|
||||
}
|
||||
dirname := dir.Name()
|
||||
if dirname[0] != '2' {
|
||||
continue
|
||||
}
|
||||
files, err := os.ReadDir("../" + dirname)
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
name := file.Name()
|
||||
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
|
||||
fp, err := os.Open("../" + dirname + "/" + name)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
seen[strings.Split(scanner.Text(), "\t")[1]] = true
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
108
cmd/trends/trends.go
Normal file
108
cmd/trends/trends.go
Normal file
@@ -0,0 +1,108 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
word string
|
||||
diff float64
|
||||
gone bool
|
||||
}
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
refs := make(map[string]int)
|
||||
refmax := 0
|
||||
fp, err := os.Open(os.Args[1])
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
aa := strings.Split(scanner.Text(), "\t")
|
||||
n, err := strconv.Atoi(aa[0])
|
||||
x(err)
|
||||
refs[aa[1]] = n
|
||||
if n > refmax {
|
||||
refmax = n
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
fp.Close()
|
||||
refmax++
|
||||
|
||||
lines := make([]string, 0)
|
||||
fp, err = os.Open(os.Args[2])
|
||||
x(err)
|
||||
scanner = bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
lines = append(lines, scanner.Text())
|
||||
}
|
||||
x(scanner.Err())
|
||||
fp.Close()
|
||||
|
||||
curmax, err := strconv.Atoi(strings.Split(lines[len(lines)-1], "\t")[0])
|
||||
x(err)
|
||||
curmax++
|
||||
|
||||
items := make([]Item, 0)
|
||||
seen := make(map[string]bool)
|
||||
|
||||
for _, line := range lines {
|
||||
aa := strings.Split(line, "\t")
|
||||
seen[aa[1]] = true
|
||||
n, err := strconv.Atoi(aa[0])
|
||||
x(err)
|
||||
m, ok := refs[aa[1]]
|
||||
if !ok {
|
||||
//continue
|
||||
m = refmax
|
||||
}
|
||||
diff := float64(m)/float64(refmax) - float64(n)/float64(curmax)
|
||||
if diff > 0.05 || diff < -0.05 {
|
||||
items = append(items, Item{
|
||||
word: aa[1],
|
||||
diff: diff,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for key, value := range refs {
|
||||
if !seen[key] {
|
||||
diff := float64(value)/float64(refmax) - 1.0
|
||||
if diff > 0.05 || diff < -0.05 {
|
||||
items = append(items, Item{
|
||||
word: key,
|
||||
diff: diff,
|
||||
gone: true,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(items, func(a, b int) bool {
|
||||
if items[a].diff == items[b].diff {
|
||||
return items[a].word < items[b].word
|
||||
}
|
||||
return items[a].diff > items[b].diff
|
||||
})
|
||||
|
||||
for _, item := range items {
|
||||
var s string
|
||||
if item.gone {
|
||||
s = "X"
|
||||
}
|
||||
fmt.Printf("%f\t%s\t%s\n", item.diff, s, item.word)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -15,9 +15,9 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
aa := strings.Split(os.Args[1], ".")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||
}
|
||||
|
||||
year, err := strconv.Atoi(aa[0])
|
||||
@@ -58,7 +58,7 @@ func main() {
|
||||
fmt.Print(" -or")
|
||||
}
|
||||
y, w := t2.ISOWeek()
|
||||
fmt.Printf(" -name %d-%02d.data.dz", y, w)
|
||||
fmt.Printf(" -name %d.%02d.data.dz", y, w)
|
||||
t2 = t2.AddDate(0, 0, 7)
|
||||
}
|
||||
|
||||
|
||||
39
collect.sh
39
collect.sh
@@ -22,10 +22,10 @@ say () {
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -35,19 +35,24 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
cd /net/corpora/nlnieuws/data
|
||||
year=${ds%%.*}
|
||||
|
||||
mkdir -p /net/corpora/nlnieuws/data/$year
|
||||
mkdir -p /net/corpora/nlnieuws/data/json/$year
|
||||
cd /net/corpora/nlnieuws/data/$year
|
||||
|
||||
declare -A parts
|
||||
#parts[alles]='.'
|
||||
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
|
||||
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
|
||||
parts[amsterdam]='AT5|BuurtAdam|Parool'
|
||||
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
|
||||
parts[literatuur]='LitNL|Tzum'
|
||||
parts[vlaanderen]='VRT'
|
||||
parts[vlaanderen]='HLN|VRT'
|
||||
#parts[AT5]='AT5'
|
||||
#parts[BuurtAdam]='BuurtAdam'
|
||||
#parts[BuurtGrn]='BuurtGrn'
|
||||
#parts[GG]='GG'
|
||||
#parts[HLN]='HLN'
|
||||
#parts[LitNL]='LitNL'
|
||||
#parts[NOS]='NOS'
|
||||
#parts[NU]='NU'
|
||||
@@ -59,6 +64,7 @@ parts[vlaanderen]='VRT'
|
||||
#parts[Sargasso]='Sargasso'
|
||||
#parts[Sikkom]='Sikkom'
|
||||
#parts[Tzum]='Tzum'
|
||||
#parts[Volkskrant]='Volkskrant'
|
||||
#parts[VRT]='VRT'
|
||||
|
||||
for part in ${!parts[@]}
|
||||
@@ -67,7 +73,7 @@ do
|
||||
|
||||
for i in 1 4
|
||||
do
|
||||
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
|
||||
files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
|
||||
if [ -z "$files" ]
|
||||
then
|
||||
continue
|
||||
@@ -157,10 +163,23 @@ do
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-nieuwe-adjww-extra-$ds-$i
|
||||
|
||||
# ranglijsten
|
||||
|
||||
say $part-rang-$ds-$i
|
||||
alto \
|
||||
'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
|
||||
'tt:%w\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
|
||||
> $part-rang-$ds-$i
|
||||
|
||||
done
|
||||
done
|
||||
|
||||
data2json $ds 1 > json/DATA-$ds-1.json
|
||||
data2json $ds 4 > json/DATA-$ds-4.json
|
||||
dates2json > json/index1.json
|
||||
dates2json > json/index4.json
|
||||
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
|
||||
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
|
||||
dates2json > ../json/index1.json
|
||||
dates2json > ../json/index4.json
|
||||
|
||||
# rechten bijwerken
|
||||
chmod -R g+w /net/corpora/nlnieuws
|
||||
chgrp -R software /net/corpora/nlnieuws
|
||||
|
||||
4
go.mod
4
go.mod
@@ -1,11 +1,13 @@
|
||||
module nlnieuws
|
||||
module git.web.rug.nl/p209327/nlnieuws
|
||||
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
codeberg.org/pebbe/errors v0.4.0
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
|
||||
github.com/pebbe/compactcorpus v1.0.3
|
||||
github.com/pebbe/textcat/v2 v2.3.0
|
||||
github.com/rug-compling/alpinods v1.18.1
|
||||
)
|
||||
|
||||
require github.com/pebbe/util v0.9.0 // indirect
|
||||
|
||||
4
go.sum
4
go.sum
@@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
|
||||
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
|
||||
github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ=
|
||||
github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ=
|
||||
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
|
||||
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
|
||||
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
|
||||
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
|
||||
github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w=
|
||||
github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q=
|
||||
|
||||
74
internal/util/util.go
Normal file
74
internal/util/util.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package util
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
p = e.PanicErr
|
||||
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
|
||||
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
|
||||
reLET = regexp.MustCompile(`\p{Lu}`)
|
||||
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
|
||||
reQuotLeft = regexp.MustCompile(`<em>|<i>`)
|
||||
reQuotRight = regexp.MustCompile(`</em>|</i>`)
|
||||
)
|
||||
|
||||
func HtmlFix(html []byte) []byte {
|
||||
html = reQuotLeft.ReplaceAllLiteral(html, []byte(" „"))
|
||||
html = reQuotRight.ReplaceAllLiteral(html, []byte("” "))
|
||||
return reBody.ReplaceAllLiteral(html, []byte(" "))
|
||||
}
|
||||
|
||||
func HtmlFixString(html string) string {
|
||||
html = reQuotLeft.ReplaceAllLiteralString(html, " „")
|
||||
html = reQuotRight.ReplaceAllLiteralString(html, "” ")
|
||||
return reBody.ReplaceAllLiteralString(html, " ")
|
||||
}
|
||||
|
||||
func AddEnd(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if reEOL.MatchString(s) {
|
||||
return s + "\n"
|
||||
}
|
||||
return s + ".\n"
|
||||
}
|
||||
|
||||
func FixSpace(s string, opt ...bool) string {
|
||||
s = strings.Join(strings.Fields(s), " ")
|
||||
|
||||
if len(opt) > 0 && opt[0] {
|
||||
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
|
||||
if strings.HasSuffix(s1, ".") {
|
||||
// zoals: v.Chr.
|
||||
return s1
|
||||
}
|
||||
i := reLET.FindStringIndex(s1)[0]
|
||||
return s1[:i] + " " + s1[i:]
|
||||
})
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
func MkLock(filename string) {
|
||||
pid := os.Getpid()
|
||||
link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
|
||||
p(os.Symlink(link, filename))
|
||||
|
||||
name, err := os.Readlink(filename)
|
||||
p(err)
|
||||
|
||||
if name != link {
|
||||
p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
|
||||
}
|
||||
}
|
||||
66
oud/fix.go
Normal file
66
oud/fix.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package main
|
||||
|
||||
/*
|
||||
Dit past corpora aan
|
||||
|
||||
Tags verwijderen:
|
||||
|
||||
Oog: Nieuws
|
||||
Parool: Nieuws
|
||||
RO: Artikelen, cafeyn
|
||||
RTVNoord: br_*
|
||||
Tzum: Nieuws
|
||||
|
||||
Tags veranderen:
|
||||
|
||||
RTVNoord: tr_* → *
|
||||
|
||||
*/
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
cc "github.com/pebbe/compactcorpus"
|
||||
"github.com/rug-compling/alpinods"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
for _, file := range os.Args[1:] {
|
||||
base := strings.TrimSuffix(file, ".data.dz")
|
||||
newfile := base + "-new.data.dz"
|
||||
|
||||
incc, err := cc.Open(file)
|
||||
x(err)
|
||||
outcc, err := cc.NewCorpus(newfile)
|
||||
x(err)
|
||||
r, err := incc.NewRange()
|
||||
x(err)
|
||||
for r.HasNext() {
|
||||
name, data := r.Next()
|
||||
fmt.Printf("%s %s \r", base, name)
|
||||
var alpino alpinods.AlpinoDS
|
||||
x(xml.Unmarshal(data, &alpino))
|
||||
for i := 0; i < len(alpino.Metadata.Meta); i++ {
|
||||
if alpino.Metadata.Meta[i].Name != "tag" {
|
||||
continue
|
||||
}
|
||||
if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
|
||||
alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
|
||||
i--
|
||||
} else if strings.HasPrefix(n, "tr_") {
|
||||
alpino.Metadata.Meta[i].Value = n[3:]
|
||||
}
|
||||
}
|
||||
outcc.Write(name, []byte(alpino.String()))
|
||||
}
|
||||
x(outcc.Close())
|
||||
}
|
||||
}
|
||||
@@ -15,7 +15,7 @@ gebruik:
|
||||
1 : nieuwe namen
|
||||
2 : nieuwe woorden
|
||||
3 : nieuwe woorden met postag en lemma
|
||||
4 : bestaaande locaties
|
||||
4 : bestaande locaties
|
||||
5 : bestaande personen
|
||||
6 : bestaande organisaties
|
||||
7 : bestaande andere namen
|
||||
@@ -1,232 +0,0 @@
|
||||
Vragen:
|
||||
|
||||
- hoe data range selecteren (bv alles van maart 2026)
|
||||
|
||||
- website met lijstjes top-N (20?)
|
||||
- nieuwe namen
|
||||
- wel of niet onderverdelen naar categorie?
|
||||
- nieuwe woorden
|
||||
- met postag
|
||||
- bestaande namen
|
||||
- personen
|
||||
- plaatsen
|
||||
- organisaties
|
||||
- misc
|
||||
|
||||
- queries worden nog beetje aangepast denk ik
|
||||
|
||||
"nieuw": nu: niet in Alpino, later (ook): niet in top-N van vorige maand.
|
||||
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' tt:%w |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
"nieuwe namen"
|
||||
|
||||
445 Straat van Hormuz
|
||||
433 Jetten
|
||||
309 AI
|
||||
301 Høiby
|
||||
250 Odido
|
||||
190 Zelensky
|
||||
174 Rob Jetten
|
||||
153 VRT NWS
|
||||
134 Jeffrey Epstein
|
||||
130 Anthropic
|
||||
125 Schulting
|
||||
115 GroenLinks-PvdA
|
||||
109 TikTok
|
||||
106 Xandra Velzeboer
|
||||
106 Kyiv
|
||||
106 JA21
|
||||
104 Starmer
|
||||
98 Marius Borg Høiby
|
||||
95 Revolutionaire Garde
|
||||
94 Jens van 't Wout
|
||||
|
||||
|
||||
"nieuwe woorden":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:%w |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
150 Trump-regering
|
||||
141 coalitieakkoord
|
||||
126 zeestraat
|
||||
122 Golfregio
|
||||
107 massastart
|
||||
96 Amerikaans-Israëlische
|
||||
92 ballistische
|
||||
90 datalek
|
||||
85 kabinet-Jetten
|
||||
82 lng
|
||||
74 droneaanval
|
||||
68 vergeldingsaanvallen
|
||||
61 tussenronde
|
||||
59 Iranoorlog
|
||||
58 vrijgave
|
||||
56 speelzand
|
||||
55 regering-Trump
|
||||
54 sprintrace
|
||||
54 ploegenachtervolging
|
||||
|
||||
liever met postag en lemma erbij:
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' tt:"%w \t %l \t %P" |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
150 Trump-regering Trump_regering N(soort,ev,basis,zijd,stan)
|
||||
141 coalitieakkoord coalitie_akkoord N(soort,ev,basis,onz,stan)
|
||||
126 zeestraat zee_straat N(soort,ev,basis,zijd,stan)
|
||||
121 Golfregio Golf_regio N(soort,ev,basis,zijd,stan)
|
||||
107 massastart massa_start N(soort,ev,basis,zijd,stan)
|
||||
96 Amerikaans-Israëlische Amerikaans_Israëlisch ADJ(prenom,basis,met-e,stan)
|
||||
90 datalek data_lek N(soort,ev,basis,onz,stan)
|
||||
90 ballistische ballistisch ADJ(prenom,basis,met-e,stan)
|
||||
82 lng lng N(soort,ev,basis,onz,stan)
|
||||
74 droneaanval drone_aanval N(soort,ev,basis,zijd,stan)
|
||||
72 kabinet-Jetten kabinet-Jetten N(soort,ev,basis,onz,stan)
|
||||
66 vergeldingsaanvallen vergelding_aanval N(soort,mv,basis)
|
||||
61 tussenronde tussen_ronde N(soort,ev,basis,zijd,stan)
|
||||
59 Iranoorlog Iran_oorlog N(soort,ev,basis,zijd,stan)
|
||||
56 speelzand speel_zand N(soort,ev,basis,onz,stan)
|
||||
55 regering-Trump regering_Trump N(soort,ev,basis,zijd,stan)
|
||||
54 vrijgave vrij_gave N(soort,ev,basis,zijd,stan)
|
||||
54 sprintrace sprint_race N(soort,ev,basis,zijd,stan)
|
||||
54 ploegenachtervolging ploeg_achtervolging N(soort,ev,basis,zijd,stan)
|
||||
53 staatsmedia staat_medium N(soort,mv,basis)
|
||||
|
||||
"bestaande locaties":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
3910 Iran
|
||||
2180 Nederland
|
||||
1929 VS
|
||||
1610 Israël
|
||||
1218 Midden-Oosten
|
||||
1128 Oekraïne
|
||||
942 Verenigde Staten
|
||||
874 Rusland
|
||||
823 Amsterdam
|
||||
776 Europa
|
||||
668 DEN HAAG
|
||||
563 België
|
||||
555 China
|
||||
445 Milaan
|
||||
429 Frankrijk
|
||||
389 Duitsland
|
||||
380 Brussel
|
||||
374 Dubai
|
||||
368 Libanon
|
||||
364 Groningen
|
||||
|
||||
"bestaande personen":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
1812 Trump
|
||||
531 Donald Trump
|
||||
327 Khamenei
|
||||
309 Epstein
|
||||
267 Verstappen
|
||||
229 Andrew
|
||||
208 Máxima
|
||||
187 Ali Khamenei
|
||||
161 Orbán
|
||||
146 Trumps
|
||||
133 Mette-Marit
|
||||
133 Keijzer
|
||||
126 Willem-Alexander
|
||||
126 Kok
|
||||
122 Charles
|
||||
118 Stolz
|
||||
113 Harald
|
||||
111 Poetin
|
||||
97 Van Persie
|
||||
94 Wilders
|
||||
|
||||
|
||||
|
||||
"bestaande organisaties":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
|
||||
2575 ANP
|
||||
547 Ajax
|
||||
449 Instagram
|
||||
421 EU
|
||||
357 Defensie
|
||||
349 Feyenoord
|
||||
348 D66
|
||||
346 VVD
|
||||
329 PSV
|
||||
305 Hezbollah
|
||||
303 Tweede Kamer
|
||||
303 NEC
|
||||
296 AZ
|
||||
265 CDA
|
||||
263 OM
|
||||
237 NU.nl
|
||||
232 NOS
|
||||
231 BBC
|
||||
224 Kamer
|
||||
219 Openbaar Ministerie
|
||||
|
||||
|
||||
"bestaande andere namen (boeken, films, events, .. )":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' tt:%l |sort | uniq -c |sort -nr | head -n 20
|
||||
|
||||
|
||||
361 Spelen
|
||||
289 Olympische Spelen
|
||||
278 Eredivisie
|
||||
244 X
|
||||
222 Winterspelen
|
||||
177 Champions League
|
||||
147 Formule 1
|
||||
143 Premier League
|
||||
137 X.
|
||||
112 Oscars
|
||||
102 Grand Prix
|
||||
100 Paralympische Spelen
|
||||
90 Facebook
|
||||
78 Eurovisie Songfestival
|
||||
76 WhatsApp
|
||||
75 Parijs-Nice
|
||||
70 Tweede Wereldoorlog
|
||||
67 Oscar
|
||||
66 The New York Times
|
||||
62 AEX-index
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* deze misschien niet? */
|
||||
"nieuwe adjectieven, deelwoorden en werkwoorden":
|
||||
|
||||
find /net/corpora/nlnieuws/ -name '*data.dz' | xargs alto fp:'//node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' tt:"%w %P" |sort | uniq -c |sort -nr |head -n 20
|
||||
|
||||
96 Amerikaans-Israëlische ADJ(prenom,basis,met-e,stan)
|
||||
90 ballistische ADJ(prenom,basis,met-e,stan)
|
||||
41 radicaal-rechtse ADJ(prenom,basis,met-e,stan)
|
||||
29 Israëlisch-Amerikaanse ADJ(prenom,basis,met-e,stan)
|
||||
27 pro-Iraanse ADJ(prenom,basis,met-e,stan)
|
||||
25 Belarussische ADJ(prenom,basis,met-e,stan)
|
||||
22 radicaal-linkse ADJ(prenom,basis,met-e,stan)
|
||||
21 Omaanse ADJ(prenom,basis,met-e,stan)
|
||||
19 pro-Palestijnse ADJ(prenom,basis,met-e,stan)
|
||||
16 partijloze ADJ(prenom,basis,met-e,stan)
|
||||
15 Eindhovense ADJ(prenom,basis,met-e,stan)
|
||||
14 cybercriminele ADJ(prenom,basis,met-e,stan)
|
||||
14 bestverkochte WW(vd,prenom,met-e)
|
||||
12 onbevestigde WW(vd,prenom,met-e)
|
||||
12 kindgebonden WW(vd,prenom,zonder)
|
||||
12 AI-gegenereerde WW(vd,prenom,met-e)
|
||||
11 toekomstbestendig ADJ(vrij,basis,zonder)
|
||||
11 omhooggegaan WW(vd,vrij,zonder)
|
||||
11 Iraans-Koerdische ADJ(prenom,basis,met-e,stan)
|
||||
11 antifascistische ADJ(prenom,basis,met-e,stan)
|
||||
@@ -1,3 +0,0 @@
|
||||
for i in ../data/*2026-14-4*; do ./top2html.py $i > `basename $i`.table; done
|
||||
for i in *t20*; do rm -f `basename $i .t20.table`.table; done
|
||||
|
||||
118
www/app.html
118
www/app.html
@@ -5,7 +5,7 @@
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<link rel="icon" href="favicon.ico" type="image/ico" />
|
||||
<link rel="stylesheet" href="style2.css" />
|
||||
<link rel="stylesheet" href="style.css" />
|
||||
<link rel="stylesheet" type="text/css" href="tooltip.css" />
|
||||
<script type="text/javascript" src="tooltip.js"></script>
|
||||
<script type="text/javascript" src="app.js" defer></script>
|
||||
@@ -74,7 +74,8 @@
|
||||
|
||||
<div class="option" id="week">
|
||||
week:
|
||||
<input type="date" id="fDate" name="date" step="7" />
|
||||
<input type="date" id="fDate" name="date" step="7" /><span
|
||||
class="validity"></span>
|
||||
</div>
|
||||
|
||||
<button type="button" onclick="kies()" id="fSubmit" disabled>
|
||||
@@ -86,5 +87,118 @@
|
||||
<h2 id="subtitle"></h2>
|
||||
</div>
|
||||
<div class="main" id="data"></div>
|
||||
<div class="foot">
|
||||
<h2>Bronnen</h2>
|
||||
<table class="bron">
|
||||
<tr>
|
||||
<td>Algemeen</td>
|
||||
<td class="bar"><div id="NieuwsNL" style="width: 100%"></div></td>
|
||||
<td><a href="https://nieuws.nl/">NieuwsNL</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="NOS"></div></td>
|
||||
<td><a href="https://nos.nl/">NOS</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="NU"></div></td>
|
||||
<td><a href="https://www.nu.nl/">NU</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="RO"></div></td>
|
||||
<td><a href="https://reportersonline.nl/">Reporters Online</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="Sargasso"></div></td>
|
||||
<td><a href="https://sargasso.nl/">Sargasso</a></td>
|
||||
</tr>
|
||||
<tr class="last">
|
||||
<td></td>
|
||||
<td class="bar"><div id="Volkskrant"></div></td>
|
||||
<td><a href="https://www.volkskrant.nl/">de Volkskrant</a></td>
|
||||
</tr>
|
||||
<tr class="first">
|
||||
<td>Amsterdam</td>
|
||||
<td class="bar"><div id="AT5"></div></td>
|
||||
<td><a href="https://www.at5.nl/">AT5</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="Parool"></div></td>
|
||||
<td>
|
||||
<a href="https://www.parool.nl/amsterdam/"
|
||||
>Het Parool | Amsterdam</a
|
||||
>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="last">
|
||||
<td></td>
|
||||
<td class="bar"><div id="BuurtAdam"></div></td>
|
||||
<td>
|
||||
<a href="https://indebuurt.nl/amsterdam/"
|
||||
>In de buurt | Amsterdam</a
|
||||
>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="first">
|
||||
<td>Groningen</td>
|
||||
<td class="bar"><div id="GG"></div></td>
|
||||
<td>
|
||||
<a href="https://gemeente.groningen.nl/nieuwsoverzicht"
|
||||
>Gemeente Groningen</a
|
||||
>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="BuurtGrn"></div></td>
|
||||
<td>
|
||||
<a href="https://indebuurt.nl/groningen/"
|
||||
>In de buurt | Groningen</a
|
||||
>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="Oog"></div></td>
|
||||
<td><a href="https://www.oogtv.nl/">Oog</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="RTVNoord"></div></td>
|
||||
<td><a href="https://www.rtvnoord.nl/">RTV Noord</a></td>
|
||||
</tr>
|
||||
<tr class="last">
|
||||
<td></td>
|
||||
<td class="bar"><div id="Sikkom"></div></td>
|
||||
<td><a href="https://sikkom.nl/">Sikkom</a></td>
|
||||
</tr>
|
||||
<tr class="first">
|
||||
<td>Literatuur</td>
|
||||
<td class="bar"><div id="LitNL"></div></td>
|
||||
<td>
|
||||
<a href="https://www.literairnederland.nl/">Literair Nederland</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="last">
|
||||
<td></td>
|
||||
<td class="bar"><div id="Tzum"></div></td>
|
||||
<td><a href="https://www.tzum.info/">Tzum</a></td>
|
||||
</tr>
|
||||
<tr class="first">
|
||||
<td>Vlaanderen</td>
|
||||
<td class="bar"><div id="HLN"></div></td>
|
||||
<td><a href="https://www.hln.be/">HLN</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td></td>
|
||||
<td class="bar"><div id="VRT"></div></td>
|
||||
<td><a href="https://www.vrt.be/vrtnws/nl/">VRT NWS</a></td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
28
www/app.js
28
www/app.js
@@ -1,5 +1,6 @@
|
||||
var dates
|
||||
var datesNr
|
||||
var countsWeek
|
||||
|
||||
var parts = [
|
||||
'nieuwe namen',
|
||||
@@ -29,6 +30,18 @@ function sleep(ms) {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms))
|
||||
}
|
||||
|
||||
/**
 * Resize the per-source bars to reflect the counts of the given week.
 *
 * Every key of data[week].sources is used as the id of a bar element; the
 * bar's width becomes its value as a percentage of data[week].max. The work
 * is skipped when the bars already show this week (tracked in countsWeek).
 *
 * @param {string} week key into the global `data` store
 */
function setCounts(week) {
  if (week == countsWeek) {
    return
  }

  var entry = data[week]
  if (!entry || !entry.sources) {
    // Nothing loaded for this week: keep the current bars and do not mark
    // the week as shown, so a later call can still update them.
    return
  }
  countsWeek = week

  var max = entry.max
  for (var [id, count] of Object.entries(entry.sources)) {
    var bar = document.getElementById(id)
    if (!bar) {
      // The data mentions a source that has no bar in the page; skipping it
      // keeps one unknown source from aborting the update of all others.
      continue
    }
    // A zero or missing max would produce 'NaN%' or 'Infinity%', which is
    // not a valid width; show an empty bar instead.
    bar.style.width = (max > 0 ? (count / max) * 100 : 0) + '%'
  }
}
|
||||
|
||||
function getJSON(url) {
|
||||
return new Promise(function (resolve, reject) {
|
||||
var xhr = new XMLHttpRequest()
|
||||
@@ -120,7 +133,9 @@ function makeTD(title, values) {
|
||||
|
||||
async function loadSource(source, week) {
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
|
||||
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
|
||||
@@ -136,11 +151,14 @@ async function loadSource(source, week) {
|
||||
tab.appendChild(tr)
|
||||
d.appendChild(tab)
|
||||
idData.innerHTML = d.innerHTML
|
||||
setCounts(week)
|
||||
}
|
||||
|
||||
async function loadPart(part, week) {
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
|
||||
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
|
||||
@@ -156,6 +174,7 @@ async function loadPart(part, week) {
|
||||
tab.appendChild(tr)
|
||||
d.appendChild(tab)
|
||||
idData.innerHTML = d.innerHTML
|
||||
setCounts(week)
|
||||
}
|
||||
|
||||
async function loadWeken(source, part) {
|
||||
@@ -170,7 +189,9 @@ async function loadWeken(source, part) {
|
||||
if (i < dates.length) {
|
||||
var week = dates[i].week
|
||||
if (!data.has(week)) {
|
||||
data[week] = await getJSON('DATA-' + week + '-4.json')
|
||||
data[week] = await getJSON(
|
||||
week.substring(0, 4) + '/DATA-' + week + '-4.json',
|
||||
)
|
||||
}
|
||||
var values = data[week][source][part]
|
||||
tr.appendChild(makeTD('t/m ' + data[week].last, values))
|
||||
@@ -180,6 +201,7 @@ async function loadWeken(source, part) {
|
||||
tab.appendChild(tr)
|
||||
d.appendChild(tab)
|
||||
idData.innerHTML = d.innerHTML
|
||||
setCounts(dates[datesNr].week)
|
||||
}
|
||||
|
||||
function locateWeek(date) {
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
#!/bin/bash
#
# Sum per-item counts over a set of input files and print them as a ranking.
#
# Usage: SCRIPT [cat] [YYYY-WW]
#
#   cat      read the *.cat.txt files instead of the default *.tag.txt files
#   YYYY-WW  ISO year and week to report on; defaults to the ISO week of the
#            date seven days ago
#
# Every input line has the form "<count> <word>...". Counts of identical
# items are added up over all input files and printed as "<count>\t<item>",
# sorted by descending count; the pre-sort makes equal counts appear in
# case-insensitive item order.

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

item=tag
if [ "$1" = "cat" ]
then
    item=cat
    shift
fi

if [ "$1" = "" ]
then
    ds=$(date -d -7days +%G-%V)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# Passed to week2files and part of the aggregate file name.
# NOTE(review): presumably the number of weeks to look back -- confirm.
w=4

# Without the corpus directory the find below would scan the wrong tree.
cd /net/corpora/nlnieuws || exit 1

declare -A counts

# collect COUNT WORD...
# Add COUNT to the running total of the item made by joining WORD... with
# single spaces.
collect() {
    local count="$1"
    shift
    local word="$*"
    local prev=${counts["$word"]:-0}
    counts["$word"]=$(( prev + count ))
}

# The output of week2files (with "data.dz" rewritten to "$item.txt") is
# spliced into the find expression after -or, so it has to stay unquoted to
# be split into separate arguments. Globbing is switched off meanwhile so
# that any wildcard in that output reaches find untouched.
# NOTE(review): assumes week2files prints find(1) expression words -- confirm.
set -f
mapfile -t files < <(
    find . -name "$ds-$w.$item.txt" -or $( week2files "$ds" "$w" | sed -e "s/data.dz/$item.txt/g" )
)
set +f

for file in "${files[@]}"
do
    # read -a splits each line into words without pathname expansion, so
    # items containing *, ? or [ are counted literally. The extra test keeps
    # a final line that lacks a trailing newline.
    while read -r -a fields || (( ${#fields[@]} > 0 ))
    do
        # A line needs a count and at least one word.
        (( ${#fields[@]} >= 2 )) || continue
        collect "${fields[@]}"
    done < "$file"
done

for i in "${!counts[@]}"
do
    printf "%8d\t%s\n" "${counts[$i]}" "$i"
done | sort -f -k 2 | sort -n -r -k 1,1 -s
|
||||
70
www/mkAll.py
70
www/mkAll.py
@@ -1,70 +0,0 @@
|
||||
#!/usr/bin/env python3
"""Write one static HTML overview page per region for a given week.

Usage: mkAll.py [-v] yyyy-ww

For every entry in ``namen`` a file ``<display name>.html`` is written to the
current directory. Each page consists of the ``head`` template, the output of
``./top2html.py`` for every part, and the ``tail`` template. With ``-v`` the
base and part being processed are printed.
"""

import sys
import re
import subprocess

# Page header; the six placeholders are name, year, week (title) and again
# name, year, week (heading).
head = '''<!doctype html>
<html>
<head>
<title>{} {} week {}</title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link rel="icon" href="favicon.ico" type="image/ico" />
<link rel="stylesheet" href="style.css" />
<link rel="stylesheet" type="text/css" href="tooltip.css" />
<script type="text/javascript" src="tooltip.js"></script>
</head>
<body>
<div class="title">
<h1>{} {} week {}</h1>
terugkijkend over vier weken
</div>
<div class="main">
'''

# Page footer, closing the last "main" div.
tail=''' </div>
</body>
</html>
'''

# Data file base name -> display name; the display name is also the name of
# the generated HTML file. Pages are generated in this order.
namen = {
    'algemeen': 'Algemeen',
    'VRT': 'Vlaanderen',
    'groningen': 'Groningen',
    'AT5': 'Amsterdam',
    'Tzum': 'Literatuur'
}

# Parts of a page, in output order.
PARTS = ('nieuwe-namen', 'nieuwe-woorden', 'personen', 'overige-namen', 'locaties', 'organisaties')


def main(argv):
    """Generate all pages; return the process exit status.

    argv is the full argument vector including the program name.
    """
    args = argv[1:]

    verbose = False
    if args and args[0] == '-v':
        verbose = True
        args = args[1:]

    # Report a missing argument instead of failing with an IndexError.
    if not args:
        print('Gebruik: {} [-v] yyyy-ww'.format(argv[0]), file=sys.stderr)
        return 1

    ep = args[0]
    # fullmatch, unlike '^...$' with match, also rejects a trailing newline.
    if not re.fullmatch('2[0-9][0-9][0-9]-[0-5][0-9]', ep):
        print("Ongeldig patroon '{}', moet yyyy-ww zijn".format(ep), file=sys.stderr)
        return 1

    jaar = ep[:4]
    # Drop the leading zero for display ("07" -> "7").
    week = str(int(ep[5:]))

    for base, name in namen.items():
        with open(name + '.html', 'wt', encoding='utf-8') as fp:
            fp.write(head.format(name, jaar, week, name, jaar, week))
            for part in PARTS:
                if verbose:
                    print(base, part)
                if part == 'locaties':
                    # Start a second block of columns.
                    fp.write('</div>\n<div class="main next">\n')
                # The "nieuwe-*" parts are read from the ".t20" data files.
                top = '.t20' if part.startswith('nieuwe') else ''
                source = '../data/{}-{}-{}-4{}'.format(base, part, ep, top)
                # The child writes straight to the same file, so everything
                # buffered so far has to be on disk first to keep the order.
                fp.flush()
                result = subprocess.run(
                    ['./top2html.py', source],
                    stdout=fp,
                    check=False)
                # Keep going with the other parts, but do not hide the failure.
                if result.returncode != 0:
                    print('top2html.py {}: exit status {}'.format(source, result.returncode),
                          file=sys.stderr)
            fp.write(tail)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|
||||
@@ -1,673 +0,0 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Nieuwe namen 2026 week 15</title>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<link rel="icon" href="favicon.ico" type="image/ico" />
|
||||
<link rel="stylesheet" href="style2.css" />
|
||||
<link rel="stylesheet" type="text/css" href="tooltip.css" />
|
||||
<script type="text/javascript" src="tooltip.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="title">
|
||||
<h1>Nieuwe namen 2026 week 15</h1>
|
||||
terugkijkend over vier weken
|
||||
</div>
|
||||
<div class="main">
|
||||
<table class="outer">
|
||||
<tr>
|
||||
<td>
|
||||
<h2>Algemeen</h2>
|
||||
<table>
|
||||
<tr
|
||||
onmouseover="tooltip.show('23 Sjoerd Sjoerdsma<br><small>Politiek, Landelijk</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Sjoerd Sjoerdsma</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('23 Wireless Festival<br><small>Entertainment, muziek</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Wireless Festival</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('19 Dean James<br><small>voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 83%"></div></td>
|
||||
<td>Dean James</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('19 Inspectie Gezondheidszorg en Jeugd<br><small>binnenland, Landelijk</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 83%"></div></td>
|
||||
<td>Inspectie Gezondheidszorg en Jeugd</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('19 Jeremy Hansen<br><small>Buitenland, Artemis II, maan, wetenschap</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 83%"></div></td>
|
||||
<td>Jeremy Hansen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('17 Artemis II<br><small>Buitenland, nasa, tech</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 74%"></div></td>
|
||||
<td>Artemis II</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('17 Lieke Klaver<br><small>sport-overig, Sport</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 74%"></div></td>
|
||||
<td>Lieke Klaver</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('17 Progressief Nederland<br><small>Landelijk, GroenLinks, politiek, pro, PvdA</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 74%"></div></td>
|
||||
<td>Progressief Nederland</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('17 Tjaronn Chery<br><small>Voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 74%"></div></td>
|
||||
<td>Tjaronn Chery</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('16 Artemis II-missie<br><small>Buitenland, wetenschap</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 70%"></div></td>
|
||||
<td>Artemis II-missie</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('16 Christina Koch<br><small>Buitenland, Artemis II, maan, wetenschap</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 70%"></div></td>
|
||||
<td>Christina Koch</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('16 Popjournalist<br><small>Cultuur, Google Nieuws, muziek, Oncko van Kammen, Oncko van Kammen - Popjournalist, www.onckovankammen.nl</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 70%"></div></td>
|
||||
<td>Popjournalist</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('16 Victor Glover<br><small>Buitenland, Artemis II, wetenschap</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 70%"></div></td>
|
||||
<td>Victor Glover</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 Erling Haaland<br><small>voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>Erling Haaland</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 Etienne Vaessen<br><small>voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>Etienne Vaessen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 Gennaro Gattuso<br><small>voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>Gennaro Gattuso</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 GL-PvdA<br><small>Landelijk, politiek</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>GL-PvdA</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 Hansi Flick<br><small>Voetbal</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>Hansi Flick</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 Heil Hitler<br><small>Entertainment, muziek</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>Heil Hitler</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('15 IDF<br><small>Buitenland</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 65%"></div></td>
|
||||
<td>IDF</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<h2>Groningen</h2>
|
||||
<table>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Korreweg')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Korreweg</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Oosterhamriklaan')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Oosterhamriklaan</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Turfsingel')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Turfsingel</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Vrijheidsplein')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Vrijheidsplein</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><div style="width: 0%"></div></td>
|
||||
<td> </td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<h2>Amsterdam</h2>
|
||||
<table>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Skatecafé')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Skatecafé</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 De Vondeling')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 75%"></div></td>
|
||||
<td>De Vondeling</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Geuzenveld')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 75%"></div></td>
|
||||
<td>Geuzenveld</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Godts')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 75%"></div></td>
|
||||
<td>Godts</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Imane Nadif')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 75%"></div></td>
|
||||
<td>Imane Nadif</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Buikslotermeerplein')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Buikslotermeerplein</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Burgernet')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Burgernet</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Carel Willinkplantsoen')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Carel Willinkplantsoen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Caro Ottenhof')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Caro Ottenhof</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Chun')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Chun</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Clen V.')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Clen V.</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Darlencho E.')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Darlencho E.</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 De Nieuwe Meer')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>De Nieuwe Meer</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Dolf Pasker')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Dolf Pasker</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Dorle Kok')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Dorle Kok</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Eid al-Fitr')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Eid al-Fitr</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Fabel Friet')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Fabel Friet</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Felyx')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Felyx</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Flevopark')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Flevopark</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Funda')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Funda</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<h2>Literatuur</h2>
|
||||
<table>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Jacques Maes & Lise Braekers<br><small>Woutertje Pieterse Prijs</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Jacques Maes & Lise Braekers</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Konijntjes<br><small>Woutertje Pieterse Prijs</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Konijntjes</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('3 Rik van de Westelaken<br><small>Woutertje Pieterse Prijs</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Rik van de Westelaken</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Charlotte Remarque')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Charlotte Remarque</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 De Taalstaat<br><small>Woutertje Pieterse Prijs</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>De Taalstaat</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Ezo Wolf<br><small>Radio</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Ezo Wolf</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Fixdit<br><small>Radio</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Fixdit</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Gescinska<br><small>Socratesbeker</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Gescinska</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Grote Gebeuren 2026<br><small>Filmpje</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Grote Gebeuren 2026</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Hebban')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Hebban</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Hedy Tjin')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Hedy Tjin</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Hession<br><small>Rónán Hession</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Hession</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Het Grote Gebeuren<br><small>Filmpje</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Het Grote Gebeuren</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Hungry Paul<br><small>Rónán Hession</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Hungry Paul</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Inktaap<br><small>De Inktaap, Joost Oomen</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Inktaap</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Jamal Ouariachi<br><small>Hendrik Groen</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Jamal Ouariachi</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Janneke Siebelink')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Janneke Siebelink</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Jazzportretten')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Jazzportretten</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Koenraad Tinel<br><small>Thomas Mann</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Koenraad Tinel</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('2 Krekel van Annet Schaap')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 67%"></div></td>
|
||||
<td>Krekel van Annet Schaap</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<h2>Vlaanderen</h2>
|
||||
<table>
|
||||
<tr
|
||||
onmouseover="tooltip.show('8 Artemis II-missie<br><small>Ruimtevaart</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 100%"></div></td>
|
||||
<td>Artemis II-missie</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('5 Christina Koch<br><small>Ruimtevaart</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 62%"></div></td>
|
||||
<td>Christina Koch</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('5 Fedasil<br><small>Politiek</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 62%"></div></td>
|
||||
<td>Fedasil</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('5 Hilde Crevits<br><small>Binnenland</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 62%"></div></td>
|
||||
<td>Hilde Crevits</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('5 Jeremy Hansen<br><small>Ruimtevaart</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 62%"></div></td>
|
||||
<td>Jeremy Hansen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('5 Wegen<br><small>Mobiliteit</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 62%"></div></td>
|
||||
<td>Wegen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 AFAS Dome<br><small>Economie</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>AFAS Dome</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Cel Vermiste Personen<br><small>Justitie</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Cel Vermiste Personen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Deurganckdok<br><small>Economie</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Deurganckdok</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Edouard Philippe<br><small>Frankrijk</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Edouard Philippe</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Gonnissen<br><small>Economie</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Gonnissen</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Instituut voor Natuur- en Bosonderzoek<br><small>Milieu & Klimaat</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Instituut voor Natuur- en Bosonderzoek</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Internationaal Ruimtestation ISS<br><small>Ruimtevaart</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Internationaal Ruimtestation ISS</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Jan Haelters<br><small>Technologie & Wetenschap</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Jan Haelters</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Jan Loos<br><small>Milieu & Klimaat</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Jan Loos</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 KMSKA<br><small>Expo</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>KMSKA</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Natuur en Bos<br><small>Milieu & Klimaat</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Natuur en Bos</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Noëlla')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Noëlla</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Omgeving')"
|
||||
onmouseout="tooltip.hide()">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Omgeving</td>
|
||||
</tr>
|
||||
<tr
|
||||
onmouseover="tooltip.show('4 Peter Bruyninckx van het Vlaams Verkeerscentrum<br><small>Mobiliteit</small>')"
|
||||
onmouseout="tooltip.hide()"
|
||||
class="tags">
|
||||
<td><div style="width: 50%"></div></td>
|
||||
<td>Peter Bruyninckx van het Vlaams Verkeerscentrum</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
159
www/style.css
159
www/style.css
@@ -13,52 +13,16 @@ html {
|
||||
|
||||
body {
|
||||
border: 0px;
|
||||
margin: 0px;
|
||||
padding: 0px 0px 8em 0px;
|
||||
margin: 2em;
|
||||
padding: 0px 0px 2em 0px;
|
||||
color: black;
|
||||
background-color: #fcfffc;
|
||||
}
|
||||
|
||||
.main {
|
||||
margin: 2em 0px 0px 0px;
|
||||
padding: 0.4em 0.2em;
|
||||
|
||||
column-count: 4;
|
||||
column-width: 22em;
|
||||
|
||||
-webkit-column-gap: 0px;
|
||||
-moz-column-gap: 0px;
|
||||
column-gap: 0px;
|
||||
|
||||
column-rule: 1px solid lightgrey;
|
||||
}
|
||||
|
||||
.main > div {
|
||||
-webkit-column-break-inside: avoid;
|
||||
-moz-column-break-inside: avoid;
|
||||
-o-column-break-inside: avoid;
|
||||
-ms-column-break-inside: avoid;
|
||||
column-break-inside: avoid;
|
||||
column-fill: balance;
|
||||
page-break-inside: avoid;
|
||||
|
||||
/* zo moet het */
|
||||
break-inside: avoid;
|
||||
|
||||
padding: 0px 1em;
|
||||
margin: 0px 0.2em;
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
.footer {
|
||||
text-align: center;
|
||||
font-size: small;
|
||||
margin-bottom: 2em;
|
||||
}
|
||||
|
||||
.title {
|
||||
text-align: center;
|
||||
padding-bottom: 4em;
|
||||
margin: 1em 0px 0px 0px;
|
||||
width: 100%;
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
@@ -66,7 +30,8 @@ h1 {
|
||||
}
|
||||
|
||||
h1,
|
||||
h2 {
|
||||
h2,
|
||||
h3 {
|
||||
color: #62757f;
|
||||
letter-spacing: 0.1em;
|
||||
margin-top: 2em;
|
||||
@@ -76,6 +41,11 @@ h1 {
|
||||
font-weight: 200;
|
||||
}
|
||||
h2 {
|
||||
line-height: 120%;
|
||||
font-size: x-large;
|
||||
font-weight: 300;
|
||||
}
|
||||
h3 {
|
||||
margin-top: 0px;
|
||||
font-size: large;
|
||||
font-weight: 400;
|
||||
@@ -94,23 +64,114 @@ a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
border-spacing: 2em;
|
||||
div.option {
|
||||
display: inline-block;
|
||||
margin-right: 2em;
|
||||
}
|
||||
|
||||
td {
|
||||
padding: 0.2em 0.5em 0.2em 0px;
|
||||
table.outer > tr > td,
|
||||
table.outer > tbody > tr > td {
|
||||
padding: 0px 1em;
|
||||
border-left: 1px solid #62757f;
|
||||
}
|
||||
|
||||
table.outer > tr > td:first-child,
|
||||
table.outer > tbody > tr > td:first-child {
|
||||
border-left: 0px;
|
||||
padding-left: 0px;
|
||||
}
|
||||
|
||||
table table {
|
||||
width: 20em;
|
||||
min-width: 20em;
|
||||
max-width: 20em;
|
||||
table-layout: fixed;
|
||||
}
|
||||
|
||||
table table td {
|
||||
/* cursor: pointer; */
|
||||
padding: 0.2em 0px;
|
||||
vertical-align: center;
|
||||
white-space: nowrap;
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
tr > td:first-child {
|
||||
table table tr > td:first-child {
|
||||
width: 100px;
|
||||
min-width: 100px;
|
||||
padding-right: 0.5em;
|
||||
}
|
||||
td > div {
|
||||
|
||||
table.bron td > div,
|
||||
table table td > div {
|
||||
height: 10px;
|
||||
margin-top: auto;
|
||||
background-color: #62757f;
|
||||
}
|
||||
|
||||
.form {
|
||||
line-height: 150%;
|
||||
}
|
||||
|
||||
table.choice {
|
||||
padding-top: 1em;
|
||||
border: 1px solid #62757f;
|
||||
}
|
||||
|
||||
table.choice td {
|
||||
vertical-align: top;
|
||||
text-align: left;
|
||||
padding: 0.4em 1.2em;
|
||||
}
|
||||
|
||||
label {
|
||||
cursor: pointer;
|
||||
}
|
||||
label:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.disabled {
|
||||
opacity: 0.4;
|
||||
}
|
||||
|
||||
#subtitle,
|
||||
#data {
|
||||
opacity: 1;
|
||||
transition: opacity 200ms linear;
|
||||
}
|
||||
#subtitle.fade,
|
||||
#data.fade {
|
||||
opacity: 0;
|
||||
transition: opacity 20ms linear;
|
||||
}
|
||||
|
||||
td.bar {
|
||||
width: 200px;
|
||||
min-width: 200px;
|
||||
}
|
||||
|
||||
table.bron {
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
table.bron td {
|
||||
padding: 0.2em 1em 0.2em 0px;
|
||||
}
|
||||
|
||||
tr.last td {
|
||||
padding-bottom: 0.5em;
|
||||
border-bottom: 1px solid #62757f;
|
||||
}
|
||||
|
||||
tr.first td {
|
||||
padding-top: 0.5em;
|
||||
}
|
||||
|
||||
#fDate:invalid + span::after {
|
||||
content: ' ✖';
|
||||
}
|
||||
|
||||
#fDate:valid + span::after {
|
||||
content: ' ✓';
|
||||
}
|
||||
|
||||
146
www/style2.css
146
www/style2.css
@@ -1,146 +0,0 @@
|
||||
/* */
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
-webkit-box-sizing: border-box;
|
||||
-moz-box-sizing: border-box;
|
||||
}
|
||||
|
||||
html {
|
||||
font-family: 'IBM Plex Serif', serif;
|
||||
font-size: 18px;
|
||||
}
|
||||
|
||||
body {
|
||||
border: 0px;
|
||||
margin: 2em;
|
||||
padding: 0px 0px 2em 0px;
|
||||
color: black;
|
||||
background-color: #fcfffc;
|
||||
}
|
||||
|
||||
.main {
|
||||
margin: 1em 0px 0px 0px;
|
||||
width: 100%;
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-top: 4em;
|
||||
}
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3 {
|
||||
color: #62757f;
|
||||
letter-spacing: 0.1em;
|
||||
margin-top: 2em;
|
||||
}
|
||||
h1 {
|
||||
font-size: xx-large;
|
||||
font-weight: 200;
|
||||
}
|
||||
h2 {
|
||||
line-height: 120%;
|
||||
font-size: x-large;
|
||||
font-weight: 300;
|
||||
}
|
||||
h3 {
|
||||
margin-top: 0px;
|
||||
font-size: large;
|
||||
font-weight: 400;
|
||||
}
|
||||
|
||||
.tags {
|
||||
color: #0000ee;
|
||||
}
|
||||
|
||||
a {
|
||||
text-decoration: none;
|
||||
color: #0000ee;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
div.option {
|
||||
display: inline-block;
|
||||
margin-right: 2em;
|
||||
}
|
||||
|
||||
table.outer > tr > td,
|
||||
table.outer > tbody > tr > td {
|
||||
padding: 0px 1em;
|
||||
border-left: 1px solid #62757f;
|
||||
}
|
||||
|
||||
table.outer > tr > td:first-child,
|
||||
table.outer > tbody > tr > td:first-child {
|
||||
border-left: 0px;
|
||||
padding-left: 0px;
|
||||
}
|
||||
|
||||
table table {
|
||||
width: 20em;
|
||||
min-width: 20em;
|
||||
max-width: 20em;
|
||||
table-layout: fixed;
|
||||
}
|
||||
|
||||
table table td {
|
||||
/* cursor: pointer; */
|
||||
padding: 0.2em 0px;
|
||||
vertical-align: center;
|
||||
white-space: nowrap;
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
table table tr > td:first-child {
|
||||
width: 100px;
|
||||
min-width: 100px;
|
||||
padding-right: 0.5em;
|
||||
}
|
||||
|
||||
table table td > div {
|
||||
height: 10px;
|
||||
margin-top: auto;
|
||||
background-color: #62757f;
|
||||
}
|
||||
|
||||
.form {
|
||||
line-height: 150%;
|
||||
}
|
||||
|
||||
table.choice {
|
||||
padding-top: 1em;
|
||||
border: 1px solid #62757f;
|
||||
}
|
||||
|
||||
table.choice td {
|
||||
vertical-align: top;
|
||||
text-align: left;
|
||||
padding: 0.4em 1.2em;
|
||||
}
|
||||
|
||||
label {
|
||||
cursor: pointer;
|
||||
}
|
||||
label:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.disabled {
|
||||
opacity: 0.4;
|
||||
}
|
||||
|
||||
#subtitle,
|
||||
#data {
|
||||
opacity: 1;
|
||||
transition: opacity 200ms linear;
|
||||
}
|
||||
#subtitle.fade,
|
||||
#data.fade {
|
||||
opacity: 0;
|
||||
transition: opacity 20ms linear;
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title><!--TITLE--></title>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<link rel="icon" href="favicon.ico" type="image/ico" />
|
||||
<link rel="stylesheet" href="style.css" />
|
||||
<link rel="stylesheet" type="text/css" href="tooltip.css" />
|
||||
<script type="text/javascript" src="tooltip.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="title">
|
||||
<h1><!--TITLE--></h1>
|
||||
terugkijkend over vier weken
|
||||
</div>
|
||||
<div class="main">
|
||||
<!--MAIN-->
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,65 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import html, sys
|
||||
|
||||
titles = {
|
||||
'nieuwe-namen': 'nieuwe namen',
|
||||
'nieuwe-woorden': 'nieuwe woorden',
|
||||
'locaties':'locaties',
|
||||
'personen':'personen',
|
||||
'organisaties':'organisaties',
|
||||
'overige-namen':'andere namen',
|
||||
'nieuwe-adjww':'nieuwe adjectieven, deelwoorden en werkwoorden',
|
||||
'cat.txt': 'categoriën',
|
||||
'tag.txt': 'tags'
|
||||
}
|
||||
|
||||
def e(s):
|
||||
return s.replace(''', '&#x27;')
|
||||
|
||||
omt0 = ''' onmouseover="tooltip.show('{} {}')" onmouseout="tooltip.hide()"'''
|
||||
omt1 = ''' onmouseover="tooltip.show('{} {}<br><small>{}</small>')" onmouseout="tooltip.hide()" class="tags"'''
|
||||
|
||||
title = sys.argv[1]
|
||||
for key, value in titles.items():
|
||||
if sys.argv[1].find(key) >= 0:
|
||||
title = value
|
||||
break
|
||||
|
||||
sys.stdout.buffer.write('''<div>
|
||||
<h2>{}</h2>
|
||||
<table>
|
||||
'''.format(html.escape(title)).encode('utf-8'))
|
||||
|
||||
cols=0
|
||||
with open(sys.argv[1], 'rt', encoding='utf-8') as fp:
|
||||
lineno = 0
|
||||
mx = 0
|
||||
for line in fp:
|
||||
line = line.strip()
|
||||
aa = line.split('\t')
|
||||
if len(aa) == 1:
|
||||
bb = line.split()
|
||||
aa[0] = bb[0]
|
||||
aa.append(' '.join(bb[1:]))
|
||||
for i in range(1, len(aa)):
|
||||
aa[i] = html.escape(aa[i])
|
||||
v = int(aa[0])
|
||||
if lineno == 0:
|
||||
mx = v
|
||||
cols=len(aa)
|
||||
p = 100 / mx * v
|
||||
if len(aa) > 2:
|
||||
mo = omt1.format(e(aa[0]), e(aa[1]), e(aa[2]))
|
||||
else:
|
||||
mo = omt0.format(e(aa[0]), e(aa[1]))
|
||||
sys.stdout.buffer.write('<tr{}><td><div style="width:{:.0f}%"></div><td>{}</tr>\n'.format(mo, p, aa[1]).encode('utf-8'))
|
||||
lineno += 1
|
||||
if lineno == 20:
|
||||
break
|
||||
while lineno < 20:
|
||||
lineno += 1
|
||||
sys.stdout.buffer.write(b'<tr><td><div style="width:0%"></div><td> </tr>\n')
|
||||
|
||||
|
||||
sys.stdout.buffer.write(b'</table>\n</div>\n\n')
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user