first commit
This commit is contained in:
29
.gitignore
vendored
Normal file
29
.gitignore
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
/Amsterdam
|
||||
/AT5/at5
|
||||
/AT5/metadata
|
||||
/AT5/xml2txt
|
||||
/GG/gg
|
||||
/GG/metadata
|
||||
/NieuwsNL/metadata
|
||||
/NieuwsNL/nieuwsnl
|
||||
/NOS/json2txt
|
||||
/NOS/metadata
|
||||
/NOS/nos
|
||||
/NU/metadata
|
||||
/NU/nu
|
||||
/RO/metadata
|
||||
/RO/ro
|
||||
/RO/xml2txt
|
||||
/Sargasso/metadata
|
||||
/Sargasso/sargasso
|
||||
/Sargasso/xml2txt
|
||||
/Sikkom/metadata
|
||||
/Sikkom/sikkom
|
||||
/Tzum/metadata
|
||||
/Tzum/tzum
|
||||
/Tzum/xml2txt
|
||||
/VRT/metadata
|
||||
/VRT/vrt
|
||||
/bin/ISOWeek
|
||||
20??
|
||||
corpus
|
||||
13
AT5/Makefile
Normal file
13
AT5/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build all AT5 tools. "all" names no file, so it is declared phony to keep
# a stray file called "all" from suppressing the build.
.PHONY: all
all: \
	xml2txt \
	metadata \
	at5

# Each tool is built from the Go files in its own cmd/ directory.
xml2txt: cmd/xml2txt/*.go
	go build -o $@ $^

metadata: cmd/metadata/*.go
	go build -o $@ $^

at5: cmd/at5/*.go
	go build -o $@ $^
|
||||
81
AT5/cmd/at5/at5.go
Normal file
81
AT5/cmd/at5/at5.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is one <item> of the feed.
type ItemT struct {
	// PubDate is the <pubDate> text; main parses it as an RFC 1123 date.
	PubDate string `xml:"pubDate"`
	// UnixTime is filled from a <unixTime> child when the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the <guid> text; main derives the output file name from it.
	Guid string `xml:"guid"`
	// Data is the raw inner XML of the item, copied verbatim to the output.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
|
||||
basename := strings.TrimPrefix(item.Guid, "https://www.at5.nl/artikelen/")
|
||||
if i := strings.LastIndex(basename, "/"); i > 0 {
|
||||
basename = basename[:i]
|
||||
}
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
}
|
||||
|
||||
}
|
||||
95
AT5/cmd/metadata/metadata.go
Normal file
95
AT5/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored article file that this tool reads: the
// <item> root element and its <unixTime> child.
type Item struct {
	XMLName xml.Name `xml:"item"`
	// UnixTime is the <unixTime> value; doXml converts it to a date.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"AT5\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item), filename)
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with the whitespace that follows it.
//
// s is returned unchanged when it has no "<metadata>" tag, or when the tag
// is never closed; the latter case used to remove just the opening tag.
func stripMeta(s string) string {
	const openTag, closeTag = "<metadata>", "</metadata>"

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}
	end := start + rel + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
92
AT5/cmd/xml2txt/xml2txt.go
Normal file
92
AT5/cmd/xml2txt/xml2txt.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item holds the two fields of a stored article that are turned into text.
type Item struct {
	// Title is the <title> text, written as the first line of the output.
	Title string `xml:"title"`
	// Text is the <description> text; main parses it as an HTML fragment.
	Text string `xml:"description"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item), filename)
|
||||
_, err = fp.WriteString(addEnd(item.Title))
|
||||
x(err)
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body/p | //body/h2`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
_, err = fp.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as one newline-terminated line.
//
// An empty result stays empty. A line that already ends in ".", "!" or
// "?", optionally followed by one straight double or single quote, only
// gets a newline; any other line gets "." and a newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	for _, ending := range []string{".", "!", "?", `."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, ending) {
			return s + "\n"
		}
	}
	return s + ".\n"
}
|
||||
65
AT5/txt2corpus.sh
Executable file
65
AT5/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Build the Alpino corpus for one ISO week of AT5 articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`.
# NOTE(review): the top-level Makefile builds bin/ISOWeek, not ISODate —
# confirm that ISODate is available on PATH.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww, the directory that holds the week's articles.
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/AT5/corpus/$ds

cd "/net/corpora/nlnieuws/AT5/$dp"

# The lock is a symlink named "lock" pointing at lock.<pid>. Under `set -e`
# a failing ln (lock already present) ends the script right here.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

../../xml2txt "$ds"

# Tokenize each article and label every line at5.<article>.<n>|
rm -f "$corpus.lines"
for i in out/*.txt
do
    b=$(basename "$i" .txt)
    # The article name reaches Perl through the environment: pasted into the
    # Perl source, a "$" or "@" in the name would be interpolated by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("at5.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# stderr of metadata is kept in ./err; the file is removed only when
# metadata succeeded (set -e stops the script before rm otherwise).
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
9
GG/Makefile
Normal file
9
GG/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Build all GG tools. "all" names no file, so it is declared phony to keep
# a stray file called "all" from suppressing the build.
.PHONY: all
all: \
	metadata \
	gg

# Each tool is built from the Go files in its own cmd/ directory.
metadata: cmd/metadata/*.go
	go build -o $@ $^

gg: cmd/gg/*.go
	go build -o $@ $^
|
||||
195
GG/cmd/gg/gg.go
Normal file
195
GG/cmd/gg/gg.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is one <item> of the feed.
type ItemT struct {
	// Title is the <title> text; it becomes the first line of the .txt file.
	Title string `xml:"title"`
	// PubDate is the <pubDate> text; main parses it as an RFC 1123 date.
	PubDate string `xml:"pubDate"`
	// UnixTime is filled from a <unixTime> child when the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the <guid> text; its path-escaped form is the file name.
	Guid string `xml:"guid"`
	// Link is the <link> text, the URL that doArticle downloads.
	Link string `xml:"link"`
	// Data is the raw inner XML of the item, copied verbatim to the output.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
// agent = "AhrefsBot/7.0"
|
||||
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "not found", counts as absent.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between <unixTime> and the following
// </unixTime> in the named file.
//
// It returns "" when the file cannot be read or has no such element.
// Previously a readable file without these tags caused a slice-bounds
// panic, because the -1 from strings.Index was used as an offset.
func fileDate(filename string) string {
	const openTag, closeTag = "<unixTime>", "</unixTime>"

	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)

	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)

	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://gemeente.groningen.nl/feed/rss/nieuws", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
|
||||
filename := dirname + "/" + url.PathEscape(item.Guid)
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
if !doArticle(filename, item.Link, item.Title, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
} else {
|
||||
if exists(filename+".txt") || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
x(err)
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
ell, err := root.Search(
|
||||
`//div[contains(@class,"component-richtext")]/p` +
|
||||
` | ` +
|
||||
`//div[contains(@class,"component-richtext")]/h2`)
|
||||
x(err)
|
||||
if len(ell) == 0 {
|
||||
_ = w(fmt.Errorf("empty: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "empty: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
x(err)
|
||||
|
||||
_, err = fp.WriteString(addEnd(title))
|
||||
x(err)
|
||||
|
||||
for _, el := range ell {
|
||||
_, err = fp.WriteString(addEnd(el.Content()))
|
||||
x(err)
|
||||
}
|
||||
|
||||
x(fp.Close())
|
||||
|
||||
x(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as one newline-terminated line.
//
// An empty result stays empty. A line that already ends in ".", "!" or
// "?", optionally followed by one straight double or single quote, only
// gets a newline; any other line gets "." and a newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	for _, ending := range []string{".", "!", "?", `."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, ending) {
			return s + "\n"
		}
	}
	return s + ".\n"
}
|
||||
95
GG/cmd/metadata/metadata.go
Normal file
95
GG/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored article file that this tool reads: the
// <item> root element and its <unixTime> child.
type Item struct {
	XMLName xml.Name `xml:"item"`
	// UnixTime is the <unixTime> value; doXml converts it to a date.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".xml") {
|
||||
doXml(filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Gemeente Groningen\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doXml(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile("../" + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with the whitespace that follows it.
//
// s is returned unchanged when it has no "<metadata>" tag, or when the tag
// is never closed; the latter case used to remove just the opening tag.
func stripMeta(s string) string {
	const openTag, closeTag = "<metadata>", "</metadata>"

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}
	end := start + rel + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
63
GG/txt2corpus.sh
Executable file
63
GG/txt2corpus.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# Build the Alpino corpus for one ISO week of Gemeente Groningen articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`.
# NOTE(review): the top-level Makefile builds bin/ISOWeek, not ISODate —
# confirm that ISODate is available on PATH.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww, the directory that holds the week's articles.
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/GG/corpus/$ds

cd "/net/corpora/nlnieuws/GG/$dp"

# The lock is a symlink named "lock" pointing at lock.<pid>. Under `set -e`
# a failing ln (lock already present) ends the script right here.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

# Tokenize each article and label every line gg.<article>.<n>|
rm -f "$corpus.lines"
for i in *.txt
do
    b=$(basename "$i" .txt)
    # The article name reaches Perl through the environment: pasted into the
    # Perl source, a "$" or "@" in the name would be interpolated by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("gg.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# stderr of metadata is kept in ./err; the file is removed only when
# metadata succeeded (set -e stops the script before rm otherwise).
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
18
Makefile
Normal file
18
Makefile
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
# Build every collector by recursing into its directory, then the ISOWeek
# helper. $(MAKE) instead of a literal "make" passes command-line flags
# (-j, -n, -k, ...) on to the sub-makes.
.PHONY: all
all:
	$(MAKE) -C AT5
	$(MAKE) -C GG
	$(MAKE) -C NieuwsNL
	$(MAKE) -C NOS
	$(MAKE) -C NU
	$(MAKE) -C RO
	$(MAKE) -C Sargasso
	$(MAKE) -C Sikkom
	$(MAKE) -C Tzum
	$(MAKE) -C VRT
	$(MAKE) bin/ISOWeek

bin/ISOWeek: cmd/ISOWeek/*.go
	go build -o $@ $^
|
||||
|
||||
|
||||
13
NOS/Makefile
Normal file
13
NOS/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build all NOS tools. "all" names no file, so it is declared phony to keep
# a stray file called "all" from suppressing the build.
.PHONY: all
all: \
	json2txt \
	metadata \
	nos

# Each tool is built from the Go files in its own cmd/ directory.
json2txt: cmd/json2txt/*.go
	go build -o $@ $^

metadata: cmd/metadata/*.go
	go build -o $@ $^

nos: cmd/nos/*.go
	go build -o $@ $^
|
||||
93
NOS/cmd/json2txt/json2txt.go
Normal file
93
NOS/cmd/json2txt/json2txt.go
Normal file
@@ -0,0 +1,93 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item holds the fields read from a stored article JSON file.
type Item struct {
	// Title is the "name" value, written as the first text line.
	Title string `json:"name"`
	// Text is the "articleBody" value, written after the title as is.
	Text string `json:"articleBody"`
	// Cats are the "articleSection" values, emitted as "cat" metadata.
	Cats []string `json:"articleSection"`
	// Tags are the "keywords" values, emitted as "tag" metadata.
	Tags []string `json:"keywords"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".json") {
|
||||
continue
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-5] + ".txt")
|
||||
x(err)
|
||||
var item Item
|
||||
x(json.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
for _, cat := range item.Tags {
|
||||
_, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(addEnd(item.Title))
|
||||
x(err)
|
||||
_, err = fp.WriteString(item.Text)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims s and returns it as one newline-terminated line.
//
// An empty result stays empty. A line that already ends in ".", "!" or
// "?", optionally followed by one straight double or single quote, only
// gets a newline; any other line gets "." and a newline.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	for _, ending := range []string{".", "!", "?", `."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, ending) {
			return s + "\n"
		}
	}
	return s + ".\n"
}
|
||||
136
NOS/cmd/metadata/metadata.go
Normal file
136
NOS/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored article file that this tool reads: the
// <item> root element and its <unixTime> child.
type Item struct {
	XMLName xml.Name `xml:"item"`
	// UnixTime is the <unixTime> value; doXml converts it to a date.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"NOS\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with the whitespace that follows it.
//
// s is returned unchanged when it has no "<metadata>" tag, or when the tag
// is never closed; the latter case used to remove just the opening tag.
func stripMeta(s string) string {
	const openTag, closeTag = "<metadata>", "</metadata>"

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}
	end := start + rel + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
170
NOS/cmd/nos/nos.go
Normal file
170
NOS/cmd/nos/nos.go
Normal file
@@ -0,0 +1,170 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is one <item> of the feed.
type ItemT struct {
	// PubDate is the <pubDate> text; main tries several date layouts on it.
	PubDate string `xml:"pubDate"`
	// UnixTime is filled from a <unixTime> child when the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the <guid> text; main derives the output file name from it.
	Guid string `xml:"guid"`
	// Link is the <link> text, the URL that doArticle downloads.
	Link string `xml:"link"`
	// Data is the raw inner XML of the item, copied verbatim to the output.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "not found", counts as absent.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between <unixTime> and the following
// </unixTime> in the named file.
//
// It returns "" when the file cannot be read or has no such element.
// Previously a readable file without these tags caused a slice-bounds
// panic, because the -1 from strings.Index was used as an offset.
func fileDate(filename string) string {
	const openTag, closeTag = "<unixTime>", "</unixTime>"

	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)

	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)

	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
resp, err := http.Get("https://feeds.nos.nl/nosnieuwsalgemeen")
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
var t time.Time
|
||||
for _, format := range []string{
|
||||
"Mon, 2 Jan 2006 15:04:05 -0700",
|
||||
"Mon, 2 Jan 2006 15:04:05 MST",
|
||||
time.RFC1123,
|
||||
time.RFC1123Z} {
|
||||
t, err = time.Parse(format, item.PubDate)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
|
||||
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://nos.nl/l/"))
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
if !doArticle(filename, item.Link, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
_ = os.Remove(filename + ".json")
|
||||
} else {
|
||||
if exists(filename+".json") || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
s := string(body)
|
||||
|
||||
ok := true
|
||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = s[i1:i2]
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".json")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
return true
|
||||
}
|
||||
65
NOS/txt2corpus.sh
Executable file
65
NOS/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Build the Alpino corpus for one ISO week of NOS articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`.
# NOTE(review): the top-level Makefile builds bin/ISOWeek, not ISODate —
# confirm that ISODate is available on PATH.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww, the directory that holds the week's articles.
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/NOS/corpus/$ds

cd "/net/corpora/nlnieuws/NOS/$dp"

# The lock is a symlink named "lock" pointing at lock.<pid>. Under `set -e`
# a failing ln (lock already present) ends the script right here.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

../../json2txt "$ds"

# Tokenize each article and label every line nos.<article>.<n>|
rm -f "$corpus.lines"
for i in out/*.txt
do
    b=$(basename "$i" .txt)
    # The article name reaches Perl through the environment: pasted into the
    # Perl source, a "$" or "@" in the name would be interpolated by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("nos.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# stderr of metadata is kept in ./err; the file is removed only when
# metadata succeeded (set -e stops the script before rm otherwise).
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
9
NU/Makefile
Normal file
9
NU/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Builds the NU command-line tools from the Go sources under cmd/.

# "all" names no file; declare it phony so that a stray file called "all"
# can never make the default target look up to date.
.PHONY: all

all: \
	metadata \
	nu

# Each binary is rebuilt when any Go file in its cmd/ subdirectory changes.
metadata: cmd/metadata/*.go
	go build -o $@ $^

nu: cmd/nu/*.go
	go build -o $@ $^
|
||||
126
NU/cmd/metadata/metadata.go
Normal file
126
NU/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
XMLName xml.Name `xml:"item"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText(filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml(filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"NU\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open("../" + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile("../" + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any white space that directly follows it. A string without
// a "<metadata>" start tag is returned unchanged.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// The end tag is searched from the start tag onwards. If it is absent,
	// Index yields -1 and end lands directly behind the start tag, so only
	// the start tag itself is dropped.
	end := start + strings.Index(s[start:], closeTag) + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
240
NU/cmd/nu/nu.go
Normal file
240
NU/cmd/nu/nu.go
Normal file
@@ -0,0 +1,240 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
type ItemT struct {
|
||||
PubDate string `xml:"pubDate"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
Guid string `xml:"guid"`
|
||||
Link string `xml:"link"`
|
||||
Data []byte `xml:",innerxml"`
|
||||
}
|
||||
|
||||
type Doc struct {
|
||||
Graph []GItem `json:"@graph"`
|
||||
}
|
||||
|
||||
type GItem struct {
|
||||
ArticleBody string `json:"articleBody"`
|
||||
ArticleSection []string `json:"articleSection"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between the first "<unixTime>" tag in the file
// and the "</unixTime>" tag that follows it.
//
// It returns "" when the file cannot be read or when either tag is missing,
// so that a damaged or foreign file is treated like an absent one instead of
// causing a slice-bounds panic.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)

	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)
	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
resp, err := http.Get("https://www.nu.nl/rss")
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
|
||||
filename := dirname + "/" + url.PathEscape(item.Guid)
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
if !doArticle(filename, item.Link, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".json")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
} else {
|
||||
// voor sommige berichten is geen .txt, alleen .json
|
||||
if exists(filename+".json") || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
s := string(body)
|
||||
ok := true
|
||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = s[i1:i2]
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
fp, err := os.Create(filename + ".json")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
|
||||
var doc Doc
|
||||
if err = json.Unmarshal([]byte(s), &doc); err != nil {
|
||||
_ = w(err, url)
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "%s: %v\n", url, err)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
cats := make([]string, 0)
|
||||
var buffer bytes.Buffer
|
||||
for _, i := range doc.Graph {
|
||||
_, err = buffer.WriteString(html.UnescapeString(i.ArticleBody))
|
||||
x(err)
|
||||
cats = append(cats, i.ArticleSection...)
|
||||
}
|
||||
text := buffer.String()
|
||||
|
||||
// sommige berichten bevatten geen tekst, maar een video bijvoorbeeld
|
||||
// dit is geen fout
|
||||
if len(text) > 0 {
|
||||
|
||||
// text bevat kopjes zonder punt aan het eind
|
||||
lines := strings.Split(text, "\n")
|
||||
for i, line := range lines {
|
||||
n := len(line)
|
||||
if n > 0 {
|
||||
if strings.ContainsAny(line[n-1:], ".!?") {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if n > 1 {
|
||||
s := line[n-2:]
|
||||
if s == `."` || s == `!"` || s == `?"` {
|
||||
continue
|
||||
}
|
||||
}
|
||||
lines[i] = line + "."
|
||||
}
|
||||
text = strings.Join(lines, "\n") + "\n"
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
x(err)
|
||||
if len(cats) == 0 {
|
||||
_, err := fmt.Fprintln(fp, "##META text cat =")
|
||||
x(err)
|
||||
} else {
|
||||
for _, cat := range cats {
|
||||
_, err := fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
}
|
||||
_, err = fp.WriteString(text)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
63
NU/txt2corpus.sh
Executable file
63
NU/txt2corpus.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# Usage: txt2corpus.sh [YYYY-NN]
#
# Turns the NU articles (*.txt) of one period into a parsed corpus:
# $corpus.lines (tokenized input), $corpus.log (Alpino stderr) and
# $corpus.data.dz (packed XML). Without an argument the period is whatever
# `ISODate -7` prints.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

# Determine the period; anything that is not empty or YYYY-NN is rejected.
case "$1" in
    "")
        ds=$(ISODate -7)
        ;;
    2[0-9][0-9][0-9]-[0-5][0-9])
        ds=$1
        ;;
    *)
        echo INVALID
        exit 1
        ;;
esac

# YYYY-NN -> YYYY/NN, the directory layout of the downloaded articles.
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/NU/corpus/$ds

cd /net/corpora/nlnieuws/NU/$dp

# Lock the directory: create a symlink "lock" that names this process and
# verify that it really points at us. With `set -e` a failing ln aborts.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

# Start from an empty work directory.
rm -rf out
mkdir out

# One line per sentence, each prefixed with the key nu.<article>.<n>|
rm -f $corpus.lines
for i in *.txt
do
    b=$(basename $i .txt)
    perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
        | perl -e '$n = 0; while(<>) { $n++; print("nu.'$b'.$n|$_"); }' \
        >> $corpus.lines
done

# Parse with Alpino; the treebank XML goes to out/xml.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

# Add metadata to the parsed files; its stderr is captured and thrown away.
../../../metadata 2> err
rm err

cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null

# Clean up the work directory and release the lock.
cd ../..
rm -rf out

rm -f lock
|
||||
9
NieuwsNL/Makefile
Normal file
9
NieuwsNL/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Builds the NieuwsNL command-line tools from the Go sources under cmd/.

# "all" names no file; declare it phony so that a stray file called "all"
# can never make the default target look up to date.
.PHONY: all

all: \
	metadata \
	nieuwsnl

# Each binary is rebuilt when any Go file in its cmd/ subdirectory changes.
metadata: cmd/metadata/*.go
	go build -o $@ $^

nieuwsnl: cmd/nieuwsnl/*.go
	go build -o $@ $^
|
||||
126
NieuwsNL/cmd/metadata/metadata.go
Normal file
126
NieuwsNL/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
XMLName xml.Name `xml:"item"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText(filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml(filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"NieuwsNL\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open("../" + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile("../" + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any white space that directly follows it. A string without
// a "<metadata>" start tag is returned unchanged.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// The end tag is searched from the start tag onwards. If it is absent,
	// Index yields -1 and end lands directly behind the start tag, so only
	// the start tag itself is dropped.
	end := start + strings.Index(s[start:], closeTag) + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
217
NieuwsNL/cmd/nieuwsnl/nieuwsnl.go
Normal file
217
NieuwsNL/cmd/nieuwsnl/nieuwsnl.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
type ItemT struct {
|
||||
Title string `xml:"title"`
|
||||
PubDate string `xml:"pubDate"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
Guid string `xml:"guid"`
|
||||
Link string `xml:"link"`
|
||||
Data []byte `xml:",innerxml"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between the first "<unixTime>" tag in the file
// and the "</unixTime>" tag that follows it.
//
// It returns "" when the file cannot be read or when either tag is missing,
// so that a damaged or foreign file is treated like an absent one instead of
// causing a slice-bounds panic.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)

	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)
	length := strings.Index(s[start:], closeTag)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://nieuws.nl/sitemap/news.xml", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
|
||||
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:"))
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
if !doArticle(filename, item.Link, item.Title, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
} else {
|
||||
if exists(filename+".txt") || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
x(err)
|
||||
|
||||
var buf bytes.Buffer
|
||||
fouten := make([]string, 0)
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
var cat string
|
||||
aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`)
|
||||
x(err)
|
||||
if len(aa) == 0 {
|
||||
_, err = fmt.Fprintln(&buf, "##META text cat =")
|
||||
x(err)
|
||||
_ = w(fmt.Errorf("no cat: %s", url))
|
||||
// geen fout, maar waarschuwing als er meer fouten zijn
|
||||
fouten = append(fouten, fmt.Sprintf("no text: %s\n", url))
|
||||
// dus geen return false
|
||||
} else {
|
||||
for _, a := range aa {
|
||||
cat = strings.ReplaceAll(a.Content(), "\n", " ")
|
||||
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
}
|
||||
|
||||
_, err = buf.WriteString(addEnd(title))
|
||||
x(err)
|
||||
|
||||
// oud: //div[@id="article-blocks"]//p
|
||||
pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
|
||||
x(err)
|
||||
if len(pp) == 0 {
|
||||
_ = w(fmt.Errorf("empty: %s", url))
|
||||
// dit is echt fout
|
||||
fouten = append(fouten, fmt.Sprintf("empty: %s\n", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
for _, fout := range fouten {
|
||||
_, err = fp.WriteString(fout)
|
||||
x(err)
|
||||
}
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false // echt fout
|
||||
}
|
||||
for _, p := range pp {
|
||||
_, err = buf.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
x(err)
|
||||
_, err = fp.Write(buf.Bytes())
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
|
||||
x(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// addEnd trims white space from s and returns it as one line that ends in
// sentence punctuation. An empty result stays empty. Text that already ends
// in '.', '!' or '?', optionally followed by a single or double straight
// quote, only gets a newline appended; all other text gets ".\n".
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	}

	for _, closing := range []string{`."`, `!"`, `?"`, `.'`, `!'`, `?'`} {
		if strings.HasSuffix(s, closing) {
			return s + "\n"
		}
	}

	return s + ".\n"
}
|
||||
65
NieuwsNL/txt2corpus.sh
Executable file
65
NieuwsNL/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Usage: txt2corpus.sh [YYYY-MM-DD]
#
# Turns the NieuwsNL articles (*.txt) of one day into a parsed corpus:
# $corpus.lines (tokenized input), $corpus.log (Alpino stderr) and
# $corpus.data.dz (packed XML).

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

# Determine the day; anything that is not empty or YYYY-MM-DD is rejected.
case "$1" in
    "")
        # nieuws.nl is organised per day, not per week,
        # so take the data of 2 days ago, not of a week ago
        ds=$(ISODate -2)
        ;;
    2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
        ds=$1
        ;;
    *)
        echo INVALID
        exit 1
        ;;
esac

# YYYY-MM-DD -> YYYY/MM/DD, the directory layout of the downloaded articles.
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds

cd /net/corpora/nlnieuws/NieuwsNL/$dp

# Lock the directory: create a symlink "lock" that names this process and
# verify that it really points at us. With `set -e` a failing ln aborts.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

# Start from an empty work directory.
rm -rf out
mkdir out

# One line per sentence, each prefixed with the key nnl.<article>.<n>|
rm -f $corpus.lines
for i in *.txt
do
    b=$(basename $i .txt)
    perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
        | perl -e '$n = 0; while(<>) { $n++; print("nnl.'$b'.$n|$_"); }' \
        >> $corpus.lines
done

# Parse with Alpino; the treebank XML goes to out/xml.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

# Add metadata to the parsed files; its stderr is captured and thrown away.
../../../../metadata 2> err
rm err

cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null

# Clean up the work directory and release the lock.
cd ../..
rm -rf out

rm -f lock
|
||||
13
RO/Makefile
Normal file
13
RO/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Builds the RO command-line tools from the Go sources under cmd/.

# "all" names no file; declare it phony so that a stray file called "all"
# can never make the default target look up to date.
.PHONY: all

all: \
	xml2txt \
	metadata \
	ro

# Each binary is rebuilt when any Go file in its cmd/ subdirectory changes.
xml2txt: cmd/xml2txt/*.go
	go build -o $@ $^

metadata: cmd/metadata/*.go
	go build -o $@ $^

ro: cmd/ro/*.go
	go build -o $@ $^
|
||||
136
RO/cmd/metadata/metadata.go
Normal file
136
RO/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
XMLName xml.Name `xml:"item"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Reporters Online\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any white space that directly follows it. A string without
// a "<metadata>" start tag is returned unchanged.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// The end tag is searched from the start tag onwards. If it is absent,
	// Index yields -1 and end lands directly behind the start tag, so only
	// the start tag itself is dropped.
	end := start + strings.Index(s[start:], closeTag) + len(closeTag)
	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
|
||||
81
RO/cmd/ro/ro.go
Normal file
81
RO/cmd/ro/ro.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
type ItemT struct {
|
||||
PubDate string `xml:"pubDate"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
Guid string `xml:"guid"`
|
||||
Data []byte `xml:",innerxml"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
|
||||
basename := strings.TrimPrefix(item.Guid, "https://reportersonline.nl/?p=")
|
||||
if i := strings.LastIndex(basename, "/"); i > 0 {
|
||||
basename = basename[:i]
|
||||
}
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
}
|
||||
|
||||
}
|
||||
102
RO/cmd/xml2txt/xml2txt.go
Normal file
102
RO/cmd/xml2txt/xml2txt.go
Normal file
@@ -0,0 +1,102 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Item struct {
|
||||
Title string `xml:"title"`
|
||||
Text string `xml:"encoded"`
|
||||
Cats []string `xml:"category"`
|
||||
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(addEnd(item.Title))
|
||||
x(err)
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
|
||||
x(err)
|
||||
for _, div := range divs {
|
||||
div.Remove()
|
||||
}
|
||||
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
_, err = fp.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims white space around s and returns it as a single line that
// closes with sentence-final punctuation.
//
// An empty result stays empty (no newline). A line ending in '.', '!' or
// '?', optionally followed by one ASCII quote character, only gets a
// newline; any other line gets ".\n" appended.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if line == "" {
		return ""
	}
	last := len(line) - 1
	switch line[last] {
	case '.', '!', '?':
		return line + "\n"
	case '"', '\'':
		// A closing quote counts when the punctuation sits right before it.
		if last > 0 {
			switch line[last-1] {
			case '.', '!', '?':
				return line + "\n"
			}
		}
	}
	return line + ".\n"
}
|
||||
65
RO/txt2corpus.sh
Executable file
65
RO/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Build the parsed corpus for one week of RO articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`
# (presumably the ISO week of seven days ago -- confirm against bin/ISODate).

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/RO/corpus/$ds

cd "/net/corpora/nlnieuws/RO/$dp"

# A symlink serves as the lock. The test sits inside the `if` so that a
# failing ln reaches the message below instead of tripping `set -e`.
# NOTE(review): when a later step fails, `set -e` exits before the lock is
# removed, so the week stays locked until cleaned up by hand -- confirm
# that this is intended.
if ! ln -s lock.$$ lock || [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

../../xml2txt "$ds"

rm -f "$corpus.lines"
for i in out/*.txt
do
    b=$(basename "$i" .txt)
    # Strip leading white space and ##META lines, tokenize, and label each
    # line as ro.NAME.N|...  The name is passed through the environment so
    # that characters such as $ and @ in it are not interpreted by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("ro.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# Add metadata to the parse results in xml/.
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
13
Sargasso/Makefile
Normal file
13
Sargasso/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build the Sargasso tools; each binary is built from the Go files in its
# cmd/ directory and placed next to this Makefile.

# `all` names no file, so keep make from being fooled by a file called "all".
.PHONY: all

all: \
	xml2txt \
	metadata \
	sargasso

xml2txt: cmd/xml2txt/*.go
	go build -o $@ $^

metadata: cmd/metadata/*.go
	go build -o $@ $^

sargasso: cmd/sargasso/*.go
	go build -o $@ $^
|
||||
136
Sargasso/cmd/metadata/metadata.go
Normal file
136
Sargasso/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored feed item that is needed for metadata.
type Item struct {
	// XMLName requires the root element to be <item>.
	XMLName xml.Name `xml:"item"`
	// UnixTime is the content of <unixTime>; doXml reads it as seconds
	// since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Sargasso\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> section from s,
// together with the white space that follows it.
//
// s is returned unchanged when it has no <metadata> tag, or when that tag
// is never closed.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		// Unterminated section: there is nothing well defined to cut.
		return s
	}
	rest := s[start+n+len(closeTag):]
	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
}
|
||||
81
Sargasso/cmd/sargasso/sargasso.go
Normal file
81
Sargasso/cmd/sargasso/sargasso.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is a single <item> of the feed.
type ItemT struct {
	// PubDate is the content of <pubDate>, as found in the feed.
	PubDate string `xml:"pubDate"`
	// UnixTime is the content of <unixTime>, if the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the content of <guid>.
	Guid string `xml:"guid"`
	// Data is the unparsed XML inside the item.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://sargasso.nl/feed/", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
|
||||
basename := strings.TrimPrefix(item.Guid, "https://sargasso.nl/?")
|
||||
if i := strings.LastIndex(basename, "p="); i >= 0 {
|
||||
basename = basename[i+2:]
|
||||
}
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
}
|
||||
|
||||
}
|
||||
97
Sargasso/cmd/xml2txt/xml2txt.go
Normal file
97
Sargasso/cmd/xml2txt/xml2txt.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item holds the parts of a stored feed item that end up in the text file.
type Item struct {
	// Title is the content of the <title> element.
	Title string `xml:"title"`
	// Text is the content of the <encoded> element; main parses it as HTML.
	Text string `xml:"encoded"`
	// Cats has one entry for every <category> element.
	Cats []string `xml:"category"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(addEnd(item.Title))
|
||||
x(err)
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body//p`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
_, err = fp.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd returns s, stripped of surrounding white space, as one line that
// ends in sentence-final punctuation.
//
// Nothing is returned for an empty line. A line that already ends in one of
// . ! ? — possibly followed by a single " or ' — is only terminated with a
// newline; otherwise a full stop is added first.
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if len(text) == 0 {
		return ""
	}
	for _, mark := range []string{".", "!", "?"} {
		if strings.HasSuffix(text, mark) ||
			strings.HasSuffix(text, mark+`"`) ||
			strings.HasSuffix(text, mark+`'`) {
			return text + "\n"
		}
	}
	return text + ".\n"
}
|
||||
65
Sargasso/txt2corpus.sh
Executable file
65
Sargasso/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Build the parsed corpus for one week of Sargasso articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`
# (presumably the ISO week of seven days ago -- confirm against bin/ISODate).

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds

cd "/net/corpora/nlnieuws/Sargasso/$dp"

# A symlink serves as the lock. The test sits inside the `if` so that a
# failing ln reaches the message below instead of tripping `set -e`.
# NOTE(review): when a later step fails, `set -e` exits before the lock is
# removed, so the week stays locked until cleaned up by hand -- confirm
# that this is intended.
if ! ln -s lock.$$ lock || [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

../../xml2txt "$ds"

rm -f "$corpus.lines"
for i in out/*.txt
do
    b=$(basename "$i" .txt)
    # Strip leading white space and ##META lines, tokenize, and label each
    # line as sargasso.NAME.N|...  The name is passed through the
    # environment so that characters such as $ and @ in it are not
    # interpreted by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("sargasso.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# Add metadata to the parse results in xml/.
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
9
Sikkom/Makefile
Normal file
9
Sikkom/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Build the Sikkom tools; each binary is built from the Go files in its
# cmd/ directory and placed next to this Makefile.

# `all` names no file, so keep make from being fooled by a file called "all".
.PHONY: all

all: \
	metadata \
	sikkom

metadata: cmd/metadata/*.go
	go build -o $@ $^

sikkom: cmd/sikkom/*.go
	go build -o $@ $^
|
||||
95
Sikkom/cmd/metadata/metadata.go
Normal file
95
Sikkom/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored feed item that is needed for metadata.
type Item struct {
	// XMLName requires the root element to be <item>.
	XMLName xml.Name `xml:"item"`
	// UnixTime is the content of <unixTime>; doXml reads it as seconds
	// since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".xml") {
|
||||
doXml(filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Sikkom\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doXml(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile("../" + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> section from s,
// together with the white space that follows it.
//
// s is returned unchanged when it has no <metadata> tag, or when that tag
// is never closed.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		// Unterminated section: there is nothing well defined to cut.
		return s
	}
	rest := s[start+n+len(closeTag):]
	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
}
|
||||
228
Sikkom/cmd/sikkom/sikkom.go
Normal file
228
Sikkom/cmd/sikkom/sikkom.go
Normal file
@@ -0,0 +1,228 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is a single <item> of the feed.
type ItemT struct {
	// Title is the content of <title>.
	Title string `xml:"title"`
	// PubDate is the content of <pubDate>, as found in the feed.
	PubDate string `xml:"pubDate"`
	// UnixTime is the content of <unixTime>, if the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the content of <guid>; main uses it for the file name.
	Guid string `xml:"guid"`
	// Link is the content of <link>; main passes it to doArticle as the
	// address of the article.
	Link string `xml:"link"`
	// Data is the unparsed XML inside the item.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between <unixTime> and the following
// </unixTime> in the named file.
//
// It returns "" when the file cannot be read or holds no complete
// <unixTime> element.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)
	s := string(b)
	start := strings.Index(s, openTag)
	if start < 0 {
		return ""
	}
	start += len(openTag)
	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		return ""
	}
	return s[start : start+n]
}
|
||||
|
||||
func main() {
|
||||
resp, err := http.Get("https://www.sikkom.nl/api/feed/rss")
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
|
||||
filename := dirname + "/" + url.PathEscape(item.Guid)
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
if !doArticle(filename, item.Link, item.Title, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
_ = os.Remove(filename + ".json")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
} else {
|
||||
if (exists(filename+".json") && exists(filename+".txt")) || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
s := string(body)
|
||||
|
||||
ok := true
|
||||
i1 := strings.Index(s, `"application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = s[i1:i2]
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".json")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
x(err)
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
pp, err := root.Search(`//div[contains(@class,"article-page__body")]//p`)
|
||||
x(err)
|
||||
|
||||
if len(pp) == 0 {
|
||||
_ = w(fmt.Errorf("empty: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "empty: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
fp, err = os.Create(filename + ".txt")
|
||||
x(err)
|
||||
|
||||
_, err = fp.WriteString(addEnd(title))
|
||||
x(err)
|
||||
|
||||
for _, p := range pp {
|
||||
_, err = fp.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
|
||||
x(fp.Close())
|
||||
return true
|
||||
}
|
||||
|
||||
// addEnd turns s into one line of text that ends in sentence-final
// punctuation.
//
// Surrounding white space is removed first; if nothing is left, "" is
// returned. When the text ends in '.', '!' or '?', or in one of those
// followed by a single ASCII quote, a newline is appended. In all other
// cases a full stop and a newline are appended.
func addEnd(s string) string {
	const marks = ".!?"

	s = strings.TrimSpace(s)
	size := len(s)
	if size == 0 {
		return ""
	}

	complete := strings.IndexByte(marks, s[size-1]) >= 0
	if !complete && size > 1 && (s[size-1] == '"' || s[size-1] == '\'') {
		complete = strings.IndexByte(marks, s[size-2]) >= 0
	}
	if complete {
		return s + "\n"
	}
	return s + ".\n"
}
|
||||
63
Sikkom/txt2corpus.sh
Executable file
63
Sikkom/txt2corpus.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# Build the parsed corpus for one week of Sikkom articles.
#
# Usage: txt2corpus.sh [yyyy-ww]
# Without an argument the week is taken from `ISODate -7`
# (presumably the ISO week of seven days ago -- confirm against bin/ISODate).

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds

cd "/net/corpora/nlnieuws/Sikkom/$dp"

# A symlink serves as the lock. The test sits inside the `if` so that a
# failing ln reaches the message below instead of tripping `set -e`.
# NOTE(review): when a later step fails, `set -e` exits before the lock is
# removed, so the week stays locked until cleaned up by hand -- confirm
# that this is intended.
if ! ln -s lock.$$ lock || [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

rm -f "$corpus.lines"
# The text files are already present in the week directory.
for i in *.txt
do
    b=$(basename "$i" .txt)
    # Strip leading white space and ##META lines, tokenize, and label each
    # line as sikkom.NAME.N|...  The name is passed through the environment
    # so that characters such as $ and @ in it are not interpreted by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("sikkom.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# Add metadata to the parse results in xml/.
../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
13
Tzum/Makefile
Normal file
13
Tzum/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build the Tzum tools; each binary is built from the Go files in its
# cmd/ directory and placed next to this Makefile.

# `all` names no file, so keep make from being fooled by a file called "all".
.PHONY: all

all: \
	xml2txt \
	metadata \
	tzum

xml2txt: cmd/xml2txt/*.go
	go build -o $@ $^

metadata: cmd/metadata/*.go
	go build -o $@ $^

tzum: cmd/tzum/*.go
	go build -o $@ $^
|
||||
136
Tzum/cmd/metadata/metadata.go
Normal file
136
Tzum/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a stored feed item that is needed for metadata.
type Item struct {
	// XMLName requires the root element to be <item>.
	XMLName xml.Name `xml:"item"`
	// UnixTime is the content of <unixTime>; doXml reads it as seconds
	// since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("", filename)
|
||||
}
|
||||
}
|
||||
files, err = os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText("../", filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml("../", filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"Tzum\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open(dirname + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(dirname, filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile(dirname + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> section from s,
// together with the white space that follows it.
//
// s is returned unchanged when it has no <metadata> tag, or when that tag
// is never closed.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)
	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}
	n := strings.Index(s[start:], closeTag)
	if n < 0 {
		// Unterminated section: there is nothing well defined to cut.
		return s
	}
	rest := s[start+n+len(closeTag):]
	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
}
|
||||
81
Tzum/cmd/tzum/tzum.go
Normal file
81
Tzum/cmd/tzum/tzum.go
Normal file
@@ -0,0 +1,81 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"rss"`
|
||||
Items []ItemT `xml:"channel>item"`
|
||||
}
|
||||
|
||||
// ItemT is a single <item> of the feed.
type ItemT struct {
	// PubDate is the content of <pubDate>, as found in the feed.
	PubDate string `xml:"pubDate"`
	// UnixTime is the content of <unixTime>, if the item has one.
	UnixTime int64 `xml:"unixTime"`
	// Guid is the content of <guid>.
	Guid string `xml:"guid"`
	// Data is the unparsed XML inside the item.
	Data []byte `xml:",innerxml"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
)
|
||||
|
||||
func main() {
|
||||
req, err := http.NewRequest("GET", "https://www.tzum.info/feed/", nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC1123Z, item.PubDate)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.PubDate)
|
||||
}
|
||||
x(err)
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
|
||||
basename := strings.TrimPrefix(item.Guid, "https://www.tzum.info/?p=")
|
||||
if i := strings.LastIndex(basename, "/"); i > 0 {
|
||||
basename = basename[:i]
|
||||
}
|
||||
filename := dirname + "/" + url.PathEscape(basename)
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
}
|
||||
|
||||
}
|
||||
100
Tzum/cmd/xml2txt/xml2txt.go
Normal file
100
Tzum/cmd/xml2txt/xml2txt.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item holds the parts of a stored feed item that end up in the text file.
type Item struct {
	// Title is the content of the <title> element.
	Title string `xml:"title"`
	// Text is the content of the <encoded> element; main parses it as HTML.
	Text string `xml:"encoded"`
	// Cats has one entry for every <category> element.
	Cats []string `xml:"category"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
||||
ds = fmt.Sprintf("%d-%02d", year, week)
|
||||
case 2:
|
||||
if !reYearWeek.MatchString(os.Args[1]) {
|
||||
x(fmt.Errorf("arg must be yyyy-ww"))
|
||||
}
|
||||
ds = os.Args[1]
|
||||
default:
|
||||
x(fmt.Errorf("too many arguments"))
|
||||
}
|
||||
dp := ds[:4] + "/" + ds[5:]
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
|
||||
x(os.MkdirAll("out", 0777))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(addEnd(item.Title))
|
||||
x(err)
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
pp, err := root.Search(`//body/p`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
s := p.Content()
|
||||
if !strings.Contains(s, "verscheen eerst op Tzum.") {
|
||||
_, err = fp.WriteString(addEnd(p.Content()))
|
||||
x(err)
|
||||
}
|
||||
}
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
// addEnd trims surrounding white space from s and returns it as a single line
// that ends in sentence-final punctuation.
//
// An empty result stays empty. Text that already ends in '.', '!' or '?',
// optionally followed by one closing ASCII quote (" or '), only gets a
// newline appended; all other text gets ".\n" appended.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	case '"', '\'':
		// A closing quote counts only when the punctuation sits right before it.
		if len(s) > 1 {
			switch s[len(s)-2] {
			case '.', '!', '?':
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}
|
||||
65
Tzum/txt2corpus.sh
Executable file
65
Tzum/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Turn one ISO week of Tzum articles into an Alpino corpus.
#
# Usage: txt2corpus.sh [yyyy-ww]
#
# Without an argument the week printed by `ISODate -7` is processed.
# Output goes to /net/corpora/nlnieuws/Tzum/corpus/yyyy-ww.{lines,log,data.dz}.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds

cd "/net/corpora/nlnieuws/Tzum/$dp"

# Take the lock. Because of `set -e` a failing ln (for instance because the
# lock already exists) ends the script right here. The lock is only removed
# by the last line, so a run that fails part-way leaves it in place.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

../../xml2txt "$ds"

rm -f "$corpus.lines"
for i in out/*.txt
do
    b=$(basename "$i" .txt)
    # Strip leading white space and ##META lines, tokenize, and label every
    # line as tzum.NAME.N. The name is passed through the environment rather
    # than spliced into the Perl source, so characters such as @ or $ in a
    # file name cannot be interpolated by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("tzum.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
9
VRT/Makefile
Normal file
9
VRT/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# Build the VRT command-line tools from the sources under cmd/.

# "all" names no file; declaring it phony keeps a stray file called "all"
# from turning the default target into a no-op.
.PHONY: all
all: \
	metadata \
	vrt

metadata: cmd/metadata/*.go
	go build -o $@ $^

vrt: cmd/vrt/*.go
	go build -o $@ $^
|
||||
126
VRT/cmd/metadata/metadata.go
Normal file
126
VRT/cmd/metadata/metadata.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Item is the part of a saved feed item that this program reads. The root
// element of the document must be <item>.
type Item struct {
	XMLName xml.Name `xml:"item"`
	// UnixTime is the content of <unixTime>; doXml interprets it as seconds
	// since the Unix epoch.
	UnixTime int64 `xml:"unixTime"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
escape = html.EscapeString
|
||||
data = make(map[string][]string)
|
||||
location *time.Location
|
||||
)
|
||||
|
||||
func main() {
|
||||
var err error
|
||||
location, err = time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
files, err := os.ReadDir("..")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasSuffix(filename, ".txt") {
|
||||
doText(filename)
|
||||
} else if strings.HasSuffix(filename, ".xml") {
|
||||
doXml(filename)
|
||||
}
|
||||
}
|
||||
|
||||
files, err = os.ReadDir("xml")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if !strings.HasSuffix(filename, ".xml") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Split(filename, ".")
|
||||
base := strings.Join(aa[1:len(aa)-2], ".")
|
||||
b, err := os.ReadFile("xml/" + filename)
|
||||
x(err)
|
||||
s := string(b)
|
||||
i := strings.Index(s, "<alpino") + 1
|
||||
i += strings.Index(s[i:], "<")
|
||||
fp, err := os.Create("xml/" + filename + ".tmp")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s[:i])
|
||||
x(err)
|
||||
_, err = fp.WriteString("<metadata>\n <meta type=\"text\" name=\"source\" value=\"VRT\"/>\n")
|
||||
x(err)
|
||||
for _, m := range data[base] {
|
||||
_, err = fp.WriteString(" " + m + "\n")
|
||||
x(err)
|
||||
}
|
||||
_, err = fp.WriteString(" </metadata>\n ")
|
||||
x(err)
|
||||
_, err = fp.WriteString(stripMeta(s[i:]))
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
|
||||
}
|
||||
}
|
||||
|
||||
func doText(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
fp, err := os.Open("../" + filename)
|
||||
x(err)
|
||||
defer func() { x(fp.Close()) }()
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if !strings.HasPrefix(line, "##META") {
|
||||
continue
|
||||
}
|
||||
aa := strings.Fields(line)
|
||||
if len(aa) > 4 {
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
|
||||
aa[1],
|
||||
escape(aa[2]),
|
||||
escape(strings.Join(aa[4:], " "))))
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
}
|
||||
|
||||
func doXml(filename string) {
|
||||
base := filename[:len(filename)-4]
|
||||
if _, ok := data[base]; !ok {
|
||||
data[base] = make([]string, 0)
|
||||
}
|
||||
b, err := os.ReadFile("../" + filename)
|
||||
x(err)
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
t := time.Unix(item.UnixTime, 0).In(location)
|
||||
data[base] = append(data[base],
|
||||
fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
|
||||
t.Year(),
|
||||
int(t.Month()),
|
||||
t.Day()))
|
||||
}
|
||||
|
||||
// stripMeta removes the first <metadata>...</metadata> section from s,
// together with the white space (spaces, tabs, CR, LF) that directly follows
// the closing tag.
//
// s is returned unchanged when it has no "<metadata>" tag, or when that tag
// is never closed.
func stripMeta(s string) string {
	before, rest, found := strings.Cut(s, "<metadata>")
	if !found {
		return s
	}
	_, after, found := strings.Cut(rest, "</metadata>")
	if !found {
		return s
	}
	return before + strings.TrimLeft(after, " \t\r\n")
}
|
||||
317
VRT/cmd/vrt/vrt.go
Normal file
317
VRT/cmd/vrt/vrt.go
Normal file
@@ -0,0 +1,317 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Rss struct {
|
||||
XMLName xml.Name `xml:"feed"`
|
||||
Items []ItemT `xml:"entry"`
|
||||
}
|
||||
|
||||
type ItemT struct {
|
||||
Title TitleT `xml:"title"`
|
||||
Published string `xml:"published"`
|
||||
Updated string `xml:"updated"`
|
||||
Nstag []string `xml:"nstag"`
|
||||
Nslabeltag []string `xml:"nslabeltag"`
|
||||
UnixTime int64 `xml:"unixTime"`
|
||||
ID string `xml:"id"`
|
||||
Link []LinkT `xml:"link"`
|
||||
Data []byte `xml:",innerxml"`
|
||||
}
|
||||
|
||||
// TitleT is the <title> element of a feed entry.
type TitleT struct {
	// Type is the value of the type attribute.
	Type string `xml:"type,attr"`
	// Text is the character data of the element.
	Text string `xml:",chardata"`
}
|
||||
|
||||
// LinkT is a <link> element of a feed entry.
type LinkT struct {
	// Type is the value of the type attribute; main looks for "text/html".
	Type string `xml:"type,attr"`
	// Href is the value of the href attribute.
	Href string `xml:"href,attr"`
}
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
w = util.WarnErr
|
||||
// agent = "AhrefsBot/7.0"
|
||||
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
// exists reports whether os.Stat succeeds for filename. Any Stat error, not
// only "does not exist", yields false.
func exists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// fileDate returns the text between the first "<unixTime>" in the file and
// the first "</unixTime>" that follows it.
//
// It returns "" when the file cannot be read or when either marker is
// missing, so a file without a time stamp simply never matches a real one.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	_, rest, found := strings.Cut(string(b), "<unixTime>")
	if !found {
		return ""
	}
	value, _, found := strings.Cut(rest, "</unixTime>")
	if !found {
		return ""
	}
	return value
}
|
||||
|
||||
func main() {
|
||||
resp, err := http.Get("https://www.vrt.be/vrtnws/nl.rss.headlines.xml")
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
var rss Rss
|
||||
x(xml.Unmarshal(body, &rss))
|
||||
|
||||
if len(rss.Items) == 0 {
|
||||
x(fmt.Errorf("len(rss.Items) == 0"))
|
||||
}
|
||||
|
||||
for _, item := range rss.Items {
|
||||
t, err := time.Parse(time.RFC3339Nano, item.Published)
|
||||
if err != nil {
|
||||
t, err = time.Parse(time.RFC1123, item.Published)
|
||||
}
|
||||
x(err)
|
||||
t2, err := time.Parse(time.RFC3339Nano, item.Updated)
|
||||
if err != nil {
|
||||
t2, _ = time.Parse(time.RFC1123, item.Updated)
|
||||
}
|
||||
if t2.After(t) {
|
||||
t = t2
|
||||
}
|
||||
year, week := t.ISOWeek()
|
||||
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
|
||||
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/"))
|
||||
|
||||
ts := fmt.Sprintf("%d", t.Unix())
|
||||
needUpdate := fileDate(filename+".xml") != ts
|
||||
|
||||
x(os.MkdirAll(dirname, 0777))
|
||||
fp, err := os.Create(filename + ".xml")
|
||||
x(err)
|
||||
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
|
||||
x(err)
|
||||
_, err = fp.Write(item.Data)
|
||||
x(err)
|
||||
_, err = fp.WriteString("</item>\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".xml", t, t))
|
||||
|
||||
var link string
|
||||
for _, l := range item.Link {
|
||||
if l.Type == "text/html" {
|
||||
link = l.Href
|
||||
}
|
||||
}
|
||||
if !doArticle(filename, link, item.Title.Text, item.Nstag, item.Nslabeltag, t, needUpdate) {
|
||||
x(os.Remove(filename + ".xml"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func doArticle(filename string, url string, title string, tags []string, labels []string, timestamp time.Time, needUpdate bool) bool {
|
||||
if needUpdate {
|
||||
_ = os.Remove(filename + ".err")
|
||||
_ = os.Remove(filename + ".txt")
|
||||
_ = os.Remove(filename + ".html")
|
||||
_ = os.Remove(filename + ".skip")
|
||||
} else {
|
||||
if exists(filename+".txt") || exists(filename+".skip") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
x(err)
|
||||
req.Header.Set("User-Agent", agent)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
x(err)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
x(err)
|
||||
x(resp.Body.Close())
|
||||
|
||||
/*
|
||||
s := string(body)
|
||||
ok := true
|
||||
i1 := strings.Index(s, `type="application/ld+json"`)
|
||||
if i1 < 0 {
|
||||
ok = false
|
||||
} else {
|
||||
i1 += strings.Index(s[i1:], `>`) + 1
|
||||
i2 := i1 + strings.Index(s[i1:], `</script>`)
|
||||
if i2 < i1 {
|
||||
ok = false
|
||||
} else {
|
||||
s = s[i1:i2]
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
_ = w(fmt.Errorf("script jsonld not found: %s", url))
|
||||
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
_, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
fp, err := os.Create(filename + ".json")
|
||||
x(err)
|
||||
_, err = fp.WriteString(s)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".json", timestamp, timestamp))
|
||||
*/
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
doc, err := gokogiri.ParseHtml(body)
|
||||
x(err)
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
lnn, err := root.Search(`//head/link[@rel="canonical"]/@href`)
|
||||
x(err)
|
||||
for _, ln := range lnn {
|
||||
if strings.Contains(ln.String(), "/liveblog/") {
|
||||
fp, err := os.Create(filename + ".skip")
|
||||
x(err)
|
||||
_, err = fp.WriteString("liveblog\n")
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".skip", timestamp, timestamp))
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
if len(tags) == 0 {
|
||||
_, err = fmt.Fprintln(&buf, "##META text cat =")
|
||||
x(err)
|
||||
} else {
|
||||
for _, tag := range tags {
|
||||
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", tag)
|
||||
x(err)
|
||||
}
|
||||
}
|
||||
if len(labels) == 0 {
|
||||
_, err = fmt.Fprintln(&buf, "##META text label =")
|
||||
x(err)
|
||||
} else {
|
||||
for _, label := range labels {
|
||||
_, err = fmt.Fprintf(&buf, "##META text label = %s\n", label)
|
||||
x(err)
|
||||
}
|
||||
}
|
||||
|
||||
_, err = buf.WriteString(clean(title))
|
||||
x(err)
|
||||
|
||||
fouten := make([]string, 0)
|
||||
|
||||
found := false
|
||||
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
_, err = fmt.Fprint(&buf, clean(p.Content()))
|
||||
x(err)
|
||||
found = true
|
||||
}
|
||||
if !found {
|
||||
fouten = append(fouten, fmt.Sprintf("no heading: %s", url))
|
||||
_ = w(fmt.Errorf("no heading: %s", url))
|
||||
}
|
||||
|
||||
found = false
|
||||
pp, err = root.Search(
|
||||
`//div[@data-sentry-component="ArticleText"]//p[contains(@class,"prose-article-body-r")]` +
|
||||
` | ` +
|
||||
`//div[@data-sentry-component="ArticleTitle"]//h2`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
_, err = fmt.Fprint(&buf, clean(p.Content()))
|
||||
x(err)
|
||||
found = true
|
||||
}
|
||||
if !found {
|
||||
fouten = append(fouten, fmt.Sprintf("no text: %s", url))
|
||||
_ = w(fmt.Errorf("no text: %s", url))
|
||||
}
|
||||
|
||||
if len(fouten) > 0 {
|
||||
fp, err := os.Create(filename + ".err")
|
||||
x(err)
|
||||
for _, fout := range fouten {
|
||||
_, err = fp.WriteString(fout)
|
||||
x(err)
|
||||
}
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".err", timestamp, timestamp))
|
||||
|
||||
fp, err = os.Create(filename + ".html")
|
||||
x(err)
|
||||
_, err = fp.Write(body)
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".html", timestamp, timestamp))
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename + ".txt")
|
||||
x(err)
|
||||
_, err = fp.Write(buf.Bytes())
|
||||
x(err)
|
||||
x(fp.Close())
|
||||
x(os.Chtimes(filename+".txt", timestamp, timestamp))
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// clean collapses every run of white space in s to a single space, drops
// leading and trailing white space, and returns the result as a single line
// that ends in sentence-final punctuation.
//
// An empty result stays empty. Text that already ends in '.', '!' or '?',
// optionally followed by one closing ASCII quote (" or '), only gets a
// newline appended; all other text gets ".\n" appended.
func clean(s string) string {
	s = strings.Join(strings.Fields(s), " ")
	if s == "" {
		return ""
	}
	switch s[len(s)-1] {
	case '.', '!', '?':
		return s + "\n"
	case '"', '\'':
		// A closing quote counts only when the punctuation sits right before it.
		if len(s) > 1 {
			switch s[len(s)-2] {
			case '.', '!', '?':
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}
|
||||
63
VRT/txt2corpus.sh
Executable file
63
VRT/txt2corpus.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# Turn one ISO week of VRT articles into an Alpino corpus.
#
# Usage: txt2corpus.sh [yyyy-ww]
#
# Without an argument the week printed by `ISODate -7` is processed.
# Output goes to /net/corpora/nlnieuws/VRT/corpus/yyyy-ww.{lines,log,data.dz}.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null

if [ "$1" = "" ]
then
    ds=$(ISODate -7)
else
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            ds=$1
            ;;
        *)
            echo INVALID
            exit 1
            ;;
    esac
fi

# yyyy-ww -> yyyy/ww
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/VRT/corpus/$ds

cd "/net/corpora/nlnieuws/VRT/$dp"

# Take the lock. Because of `set -e` a failing ln (for instance because the
# lock already exists) ends the script right here. The lock is only removed
# by the last line, so a run that fails part-way leaves it in place.
ln -s lock.$$ lock
if [ "$(readlink lock)" != lock.$$ ]
then
    echo Getting lock failed
    exit 1
fi

rm -fr out
mkdir out

rm -f "$corpus.lines"
for i in *.txt
do
    b=$(basename "$i" .txt)
    # Strip leading white space and ##META lines, tokenize, and label every
    # line as vrt.NAME.N. The name is passed through the environment rather
    # than spliced into the Perl source, so characters such as @ or $ in a
    # file name cannot be interpolated by Perl.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | b="$b" perl -e '$n = 0; while(<>) { $n++; print("vrt.$ENV{b}.$n|$_"); }' \
        >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

../../../metadata 2> err
rm err

cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

cd ../..
rm -fr out

rm -f lock
|
||||
48
cmd/ISOWeek/ISOWeek.go
Normal file
48
cmd/ISOWeek/ISOWeek.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package main
|
||||
|
||||
/*
|
||||
|
||||
Waarom?
|
||||
|
||||
We willen year-week, bijvoorbeeld 2025-52
|
||||
|
||||
Als de datum 1 januari 2027 is, dan geeft dit:
|
||||
|
||||
date +%Y-%V
|
||||
|
||||
... dit:
|
||||
|
||||
2027-53
|
||||
|
||||
Dat is fout. Het moet zijn:
|
||||
|
||||
2026-53
|
||||
|
||||
Dit programma geeft wel de juiste uitvoer.
|
||||
|
||||
*/
|
||||
|
||||
import (
|
||||
"github.com/pebbe/util"
|
||||
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
x = util.CheckErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
// arg 1: aantal dagen opgeteld bij huidige datum
|
||||
d, err := strconv.Atoi(os.Args[1])
|
||||
x(err)
|
||||
|
||||
location, err := time.LoadLocation("Europe/Amsterdam")
|
||||
x(err)
|
||||
|
||||
year, week := time.Now().AddDate(0, 0, d).In(location).ISOWeek()
|
||||
fmt.Printf("%d-%02d\n", year, week)
|
||||
}
|
||||
8
go.mod
Normal file
8
go.mod
Normal file
@@ -0,0 +1,8 @@
|
||||
module nlnieuws
|
||||
|
||||
go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
|
||||
github.com/pebbe/util v0.9.0
|
||||
)
|
||||
4
go.sum
Normal file
4
go.sum
Normal file
@@ -0,0 +1,4 @@
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
|
||||
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
|
||||
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
|
||||
Reference in New Issue
Block a user