commit 36f051a8a97aa4452d56458ac332ee0f4fc28f0c Author: Peter Kleiweg Date: Mon Mar 2 15:34:37 2026 +0100 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..16a96c1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +/Amsterdam +/AT5/at5 +/AT5/metadata +/AT5/xml2txt +/GG/gg +/GG/metadata +/NieuwsNL/metadata +/NieuwsNL/nieuwsnl +/NOS/json2txt +/NOS/metadata +/NOS/nos +/NU/metadata +/NU/nu +/RO/metadata +/RO/ro +/RO/xml2txt +/Sargasso/metadata +/Sargasso/sargasso +/Sargasso/xml2txt +/Sikkom/metadata +/Sikkom/sikkom +/Tzum/metadata +/Tzum/tzum +/Tzum/xml2txt +/VRT/metadata +/VRT/vrt +/bin/ISOWeek +20?? +corpus diff --git a/AT5/Makefile b/AT5/Makefile new file mode 100644 index 0000000..63bab1b --- /dev/null +++ b/AT5/Makefile @@ -0,0 +1,13 @@ +all: \ + xml2txt \ + metadata \ + at5 + +xml2txt: cmd/xml2txt/*.go + go build -o $@ $^ + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +at5: cmd/at5/*.go + go build -o $@ $^ diff --git a/AT5/cmd/at5/at5.go b/AT5/cmd/at5/at5.go new file mode 100644 index 0000000..841cdc7 --- /dev/null +++ b/AT5/cmd/at5/at5.go @@ -0,0 +1,81 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + agent = "AhrefsBot/7.0" +) + +func main() { + req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := 
time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) + basename := strings.TrimPrefix(item.Guid, "https://www.at5.nl/artikelen/") + if i := strings.LastIndex(basename, "/"); i > 0 { + basename = basename[:i] + } + filename := dirname + "/" + url.PathEscape(basename) + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + } + +} diff --git a/AT5/cmd/metadata/metadata.go b/AT5/cmd/metadata/metadata.go new file mode 100644 index 0000000..ad19fb1 --- /dev/null +++ b/AT5/cmd/metadata/metadata.go @@ -0,0 +1,95 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + 
_, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item), filename) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/AT5/cmd/xml2txt/xml2txt.go b/AT5/cmd/xml2txt/xml2txt.go new file mode 100644 index 0000000..9b4a6fa --- /dev/null +++ b/AT5/cmd/xml2txt/xml2txt.go @@ -0,0 +1,92 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "regexp" + "strings" + "time" +) + +type Item struct { + Title string `xml:"title"` + Text string `xml:"description"` +} + +var ( + x = util.CheckErr + + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) +) + +func main() { + + var ds string + switch len(os.Args) { + case 1: + year, week := time.Now().AddDate(0, 0, -7).ISOWeek() + ds = fmt.Sprintf("%d-%02d", year, week) + case 2: + if !reYearWeek.MatchString(os.Args[1]) { + x(fmt.Errorf("arg must be yyyy-ww")) + } + ds = os.Args[1] + default: + x(fmt.Errorf("too many arguments")) + } + dp := ds[:4] + "/" + ds[5:] + + x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp)) + x(os.MkdirAll("out", 0777)) + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + b, err := os.ReadFile(filename) + x(err) + fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") + x(err) + var item Item + 
x(xml.Unmarshal(b, &item), filename) + _, err = fp.WriteString(addEnd(item.Title)) + x(err) + doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) + x(err) + root := doc.Root() + pp, err := root.Search(`//body/p | //body/h2`) + x(err) + for _, p := range pp { + _, err = fp.WriteString(addEnd(p.Content())) + x(err) + } + x(err) + x(fp.Close()) + } +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/AT5/txt2corpus.sh b/AT5/txt2corpus.sh new file mode 100755 index 0000000..f0662fb --- /dev/null +++ b/AT5/txt2corpus.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/AT5/corpus/$ds + +cd /net/corpora/nlnieuws/AT5/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +../../xml2txt $ds + +rm -f $corpus.lines +for i in out/*.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("at5.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. 
+rm -fr out + +rm -f lock diff --git a/GG/Makefile b/GG/Makefile new file mode 100644 index 0000000..2bf1e11 --- /dev/null +++ b/GG/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + gg + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +gg: cmd/gg/*.go + go build -o $@ $^ diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go new file mode 100644 index 0000000..f0bf530 --- /dev/null +++ b/GG/cmd/gg/gg.go @@ -0,0 +1,195 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + Title string `xml:"title"` + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + w = util.WarnErr + // agent = "AhrefsBot/7.0" + agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + return s[i1:i2] +} + +func main() { + req, err := http.NewRequest("GET", "https://gemeente.groningen.nl/feed/rss/nieuws", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() 
+ dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) + filename := dirname + "/" + url.PathEscape(item.Guid) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + if !doArticle(filename, item.Link, item.Title, t, needUpdate) { + x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".txt") + _ = os.Remove(filename + ".skip") + } else { + if exists(filename+".txt") || exists(filename+".skip") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + doc, err := gokogiri.ParseHtml(body) + x(err) + + root := doc.Root() + + ell, err := root.Search( + `//div[contains(@class,"component-richtext")]/p` + + ` | ` + + `//div[contains(@class,"component-richtext")]/h2`) + x(err) + if len(ell) == 0 { + _ = w(fmt.Errorf("empty: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "empty: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + fp, err := os.Create(filename + ".txt") + x(err) + + _, err = 
fp.WriteString(addEnd(title)) + x(err) + + for _, el := range ell { + _, err = fp.WriteString(addEnd(el.Content())) + x(err) + } + + x(fp.Close()) + + x(os.Chtimes(filename+".txt", timestamp, timestamp)) + + return true +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/GG/cmd/metadata/metadata.go b/GG/cmd/metadata/metadata.go new file mode 100644 index 0000000..938cc7a --- /dev/null +++ b/GG/cmd/metadata/metadata.go @@ -0,0 +1,95 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".xml") { + doXml(filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doXml(filename string) { + base := 
filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile("../" + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/GG/txt2corpus.sh b/GG/txt2corpus.sh new file mode 100755 index 0000000..354c18d --- /dev/null +++ b/GG/txt2corpus.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/GG/corpus/$ds + +cd /net/corpora/nlnieuws/GG/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("gg.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. 
+rm -fr out + +rm -f lock diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e09f7d8 --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ + +all: + make -C AT5 + make -C GG + make -C NieuwsNL + make -C NOS + make -C NU + make -C RO + make -C Sargasso + make -C Sikkom + make -C Tzum + make -C VRT + make bin/ISOWeek + +bin/ISOWeek: cmd/ISOWeek/*.go + go build -o $@ $^ + + diff --git a/NOS/Makefile b/NOS/Makefile new file mode 100644 index 0000000..d66f8cf --- /dev/null +++ b/NOS/Makefile @@ -0,0 +1,13 @@ +all: \ + json2txt \ + metadata \ + nos + +json2txt: cmd/json2txt/*.go + go build -o $@ $^ + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +nos: cmd/nos/*.go + go build -o $@ $^ diff --git a/NOS/cmd/json2txt/json2txt.go b/NOS/cmd/json2txt/json2txt.go new file mode 100644 index 0000000..cc51e9f --- /dev/null +++ b/NOS/cmd/json2txt/json2txt.go @@ -0,0 +1,93 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/json" + "fmt" + "os" + "regexp" + "strings" + "time" +) + +type Item struct { + Title string `json:"name"` + Text string `json:"articleBody"` + Cats []string `json:"articleSection"` + Tags []string `json:"keywords"` +} + +var ( + x = util.CheckErr + + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) +) + +func main() { + + var ds string + switch len(os.Args) { + case 1: + year, week := time.Now().AddDate(0, 0, -7).ISOWeek() + ds = fmt.Sprintf("%d-%02d", year, week) + case 2: + if !reYearWeek.MatchString(os.Args[1]) { + x(fmt.Errorf("arg must be yyyy-ww")) + } + ds = os.Args[1] + default: + x(fmt.Errorf("too many arguments")) + } + dp := ds[:4] + "/" + ds[5:] + + x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp)) + x(os.MkdirAll("out", 0777)) + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".json") { + continue + } + b, err := os.ReadFile(filename) + x(err) + fp, err := os.Create("out/" + filename[:len(filename)-5] + ".txt") + x(err) + var 
item Item + x(json.Unmarshal(b, &item)) + for _, cat := range item.Cats { + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + x(err) + } + for _, cat := range item.Tags { + _, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat) + x(err) + } + _, err = fp.WriteString(addEnd(item.Title)) + x(err) + _, err = fp.WriteString(item.Text) + x(err) + x(fp.Close()) + } +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/NOS/cmd/metadata/metadata.go b/NOS/cmd/metadata/metadata.go new file mode 100644 index 0000000..1a7f802 --- /dev/null +++ b/NOS/cmd/metadata/metadata.go @@ -0,0 +1,136 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + 
continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open(dirname + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/NOS/cmd/nos/nos.go b/NOS/cmd/nos/nos.go new file mode 100644 index 0000000..87cd7be --- /dev/null +++ b/NOS/cmd/nos/nos.go @@ -0,0 +1,170 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT 
`xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + w = util.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + return s[i1:i2] +} + +func main() { + resp, err := http.Get("https://feeds.nos.nl/nosnieuwsalgemeen") + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + var t time.Time + for _, format := range []string{ + "Mon, 2 Jan 2006 15:04:05 -0700", + "Mon, 2 Jan 2006 15:04:05 MST", + time.RFC1123, + time.RFC1123Z} { + t, err = time.Parse(format, item.PubDate) + if err == nil { + break + } + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week) + filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://nos.nl/l/")) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + if !doArticle(filename, item.Link, t, needUpdate) { + x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = 
os.Remove(filename + ".html") + _ = os.Remove(filename + ".skip") + _ = os.Remove(filename + ".json") + } else { + if exists(filename+".json") || exists(filename+".skip") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + s := string(body) + + ok := true + i1 := strings.Index(s, ``) + if i2 < i1 { + ok = false + } else { + s = s[i1:i2] + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + fp, err := os.Create(filename + ".json") + x(err) + _, err = fp.WriteString(s) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".json", timestamp, timestamp)) + return true +} diff --git a/NOS/txt2corpus.sh b/NOS/txt2corpus.sh new file mode 100755 index 0000000..1735991 --- /dev/null +++ b/NOS/txt2corpus.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. 
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/NOS/corpus/$ds + +cd /net/corpora/nlnieuws/NOS/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +../../json2txt $ds + +rm -f $corpus.lines +for i in out/*.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("nos.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/NU/Makefile b/NU/Makefile new file mode 100644 index 0000000..2f5ecfa --- /dev/null +++ b/NU/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + nu + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +nu: cmd/nu/*.go + go build -o $@ $^ diff --git a/NU/cmd/metadata/metadata.go b/NU/cmd/metadata/metadata.go new file mode 100644 index 0000000..7c5b52b --- /dev/null +++ b/NU/cmd/metadata/metadata.go @@ -0,0 +1,126 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText(filename) + } else if strings.HasSuffix(filename, 
".xml") { + doXml(filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open("../" + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile("../" + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go new file mode 100644 index 0000000..956ba7c --- /dev/null +++ b/NU/cmd/nu/nu.go @@ -0,0 +1,240 @@ +package main + +import ( + "github.com/pebbe/util" + + "bytes" + 
"encoding/json" + "encoding/xml" + "fmt" + "html" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +type Doc struct { + Graph []GItem `json:"@graph"` +} + +type GItem struct { + ArticleBody string `json:"articleBody"` + ArticleSection []string `json:"articleSection"` +} + +var ( + x = util.CheckErr + w = util.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + return s[i1:i2] +} + +func main() { + resp, err := http.Get("https://www.nu.nl/rss") + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week) + filename := dirname + "/" + url.PathEscape(item.Guid) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + if !doArticle(filename, item.Link, t, needUpdate) { + 
x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".json") + _ = os.Remove(filename + ".txt") + _ = os.Remove(filename + ".skip") + } else { + // voor sommige berichten is geen .txt, alleen .json + if exists(filename+".json") || exists(filename+".skip") { + return true + } + } + + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + s := string(body) + ok := true + i1 := strings.Index(s, ``) + if i2 < i1 { + ok = false + } else { + s = s[i1:i2] + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + fp, err := os.Create(filename + ".json") + x(err) + _, err = fp.WriteString(s) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".json", timestamp, timestamp)) + + var doc Doc + if err = json.Unmarshal([]byte(s), &doc); err != nil { + _ = w(err, url) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "%s: %v\n", url, err) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + return false + } + + cats := make([]string, 0) + var buffer bytes.Buffer + for _, i := range doc.Graph { + _, err = buffer.WriteString(html.UnescapeString(i.ArticleBody)) + x(err) + cats = append(cats, i.ArticleSection...) 
+ } + text := buffer.String() + + // sommige berichten bevatten geen tekst, maar een video bijvoorbeeld + // dit is geen fout + if len(text) > 0 { + + // text bevat kopjes zonder punt aan het eind + lines := strings.Split(text, "\n") + for i, line := range lines { + n := len(line) + if n > 0 { + if strings.ContainsAny(line[n-1:], ".!?") { + continue + } + } + if n > 1 { + s := line[n-2:] + if s == `."` || s == `!"` || s == `?"` { + continue + } + } + lines[i] = line + "." + } + text = strings.Join(lines, "\n") + "\n" + + fp, err := os.Create(filename + ".txt") + x(err) + if len(cats) == 0 { + _, err := fmt.Fprintln(fp, "##META text cat =") + x(err) + } else { + for _, cat := range cats { + _, err := fmt.Fprintf(fp, "##META text cat = %s\n", cat) + x(err) + } + } + _, err = fp.WriteString(text) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".txt", timestamp, timestamp)) + } + + return true +} diff --git a/NU/txt2corpus.sh b/NU/txt2corpus.sh new file mode 100755 index 0000000..eec98e0 --- /dev/null +++ b/NU/txt2corpus.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. 
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/NU/corpus/$ds + +cd /net/corpora/nlnieuws/NU/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("nu.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/NieuwsNL/Makefile b/NieuwsNL/Makefile new file mode 100644 index 0000000..3e286e0 --- /dev/null +++ b/NieuwsNL/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + nieuwsnl + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +nieuwsnl: cmd/nieuwsnl/*.go + go build -o $@ $^ diff --git a/NieuwsNL/cmd/metadata/metadata.go b/NieuwsNL/cmd/metadata/metadata.go new file mode 100644 index 0000000..b011d50 --- /dev/null +++ b/NieuwsNL/cmd/metadata/metadata.go @@ -0,0 +1,126 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText(filename) + } else if 
strings.HasSuffix(filename, ".xml") { + doXml(filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open("../" + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile("../" + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go new file mode 100644 index 0000000..c8d986e --- /dev/null +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ 
-0,0 +1,217 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "bytes" + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + Title string `xml:"title"` + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + w = util.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + return s[i1:i2] +} + +func main() { + req, err := http.NewRequest("GET", "https://nieuws.nl/sitemap/news.xml", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) + filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:")) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + 
x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + if !doArticle(filename, item.Link, item.Title, t, needUpdate) { + x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".txt") + _ = os.Remove(filename + ".skip") + } else { + if exists(filename+".txt") || exists(filename+".skip") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + doc, err := gokogiri.ParseHtml(body) + x(err) + + var buf bytes.Buffer + fouten := make([]string, 0) + + root := doc.Root() + + var cat string + aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`) + x(err) + if len(aa) == 0 { + _, err = fmt.Fprintln(&buf, "##META text cat =") + x(err) + _ = w(fmt.Errorf("no cat: %s", url)) + // geen fout, maar waarschuwing als er meer fouten zijn + fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) + // dus geen return false + } else { + for _, a := range aa { + cat = strings.ReplaceAll(a.Content(), "\n", " ") + _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", cat) + x(err) + } + } + + _, err = buf.WriteString(addEnd(title)) + x(err) + + // oud: //div[@id="article-blocks"]//p + pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`) + x(err) + if len(pp) == 0 { + _ = w(fmt.Errorf("empty: %s", url)) + // dit is echt fout + fouten = append(fouten, fmt.Sprintf("empty: %s\n", url)) + + fp, err := os.Create(filename + ".err") + x(err) + for _, fout := range fouten { + _, err = fp.WriteString(fout) + x(err) + } + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + 
+ fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false // echt fout + } + for _, p := range pp { + _, err = buf.WriteString(addEnd(p.Content())) + x(err) + } + + fp, err := os.Create(filename + ".txt") + x(err) + _, err = fp.Write(buf.Bytes()) + x(err) + x(fp.Close()) + + x(os.Chtimes(filename+".txt", timestamp, timestamp)) + + return true +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/NieuwsNL/txt2corpus.sh b/NieuwsNL/txt2corpus.sh new file mode 100755 index 0000000..d794a73 --- /dev/null +++ b/NieuwsNL/txt2corpus.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. 
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + # nieuws.nl gaat per dag, niet per week + # dus gegevens van 2 dagen geleden, niet een week geleden + ds=`ISODate -2` +else + case "$1" in + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds + +cd /net/corpora/nlnieuws/NieuwsNL/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("nnl.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/RO/Makefile b/RO/Makefile new file mode 100644 index 0000000..c3d8f3f --- /dev/null +++ b/RO/Makefile @@ -0,0 +1,13 @@ +all: \ + xml2txt \ + metadata \ + ro + +xml2txt: cmd/xml2txt/*.go + go build -o $@ $^ + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +ro: cmd/ro/*.go + go build -o $@ $^ diff --git a/RO/cmd/metadata/metadata.go b/RO/cmd/metadata/metadata.go new file mode 100644 index 0000000..eb04fd0 --- /dev/null +++ b/RO/cmd/metadata/metadata.go @@ -0,0 +1,136 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + 
for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open(dirname + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + 
int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/RO/cmd/ro/ro.go b/RO/cmd/ro/ro.go new file mode 100644 index 0000000..1a45943 --- /dev/null +++ b/RO/cmd/ro/ro.go @@ -0,0 +1,81 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + agent = "AhrefsBot/7.0" +) + +func main() { + req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) + basename := strings.TrimPrefix(item.Guid, "https://reportersonline.nl/?p=") + if i := strings.LastIndex(basename, "/"); i > 0 { + basename = basename[:i] + } + filename := dirname + "/" + url.PathEscape(basename) + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + 
x(os.Chtimes(filename+".xml", t, t)) + } + +} diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go new file mode 100644 index 0000000..5631268 --- /dev/null +++ b/RO/cmd/xml2txt/xml2txt.go @@ -0,0 +1,102 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "regexp" + "strings" + "time" +) + +type Item struct { + Title string `xml:"title"` + Text string `xml:"encoded"` + Cats []string `xml:"category"` +} + +var ( + x = util.CheckErr + + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) +) + +func main() { + + var ds string + switch len(os.Args) { + case 1: + year, week := time.Now().AddDate(0, 0, -7).ISOWeek() + ds = fmt.Sprintf("%d-%02d", year, week) + case 2: + if !reYearWeek.MatchString(os.Args[1]) { + x(fmt.Errorf("arg must be yyyy-ww")) + } + ds = os.Args[1] + default: + x(fmt.Errorf("too many arguments")) + } + dp := ds[:4] + "/" + ds[5:] + + x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp)) + x(os.MkdirAll("out", 0777)) + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + b, err := os.ReadFile(filename) + x(err) + fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + for _, cat := range item.Cats { + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + x(err) + } + _, err = fp.WriteString(addEnd(item.Title)) + x(err) + doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) + x(err) + root := doc.Root() + divs, err := root.Search(`//div[@class="donatieformlinks"]`) + x(err) + for _, div := range divs { + div.Remove() + } + pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) + x(err) + for _, p := range pp { + _, err = fp.WriteString(addEnd(p.Content())) + x(err) + } + x(err) + x(fp.Close()) + } +} + +func addEnd(s string) string { + s = 
strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/RO/txt2corpus.sh b/RO/txt2corpus.sh new file mode 100755 index 0000000..b7f7e20 --- /dev/null +++ b/RO/txt2corpus.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/RO/corpus/$ds + +cd /net/corpora/nlnieuws/RO/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +../../xml2txt $ds + +rm -f $corpus.lines +for i in out/*.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("ro.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. 
+rm -fr out + +rm -f lock diff --git a/Sargasso/Makefile b/Sargasso/Makefile new file mode 100644 index 0000000..39e8f87 --- /dev/null +++ b/Sargasso/Makefile @@ -0,0 +1,13 @@ +all: \ + xml2txt \ + metadata \ + sargasso + +xml2txt: cmd/xml2txt/*.go + go build -o $@ $^ + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +sargasso: cmd/sargasso/*.go + go build -o $@ $^ diff --git a/Sargasso/cmd/metadata/metadata.go b/Sargasso/cmd/metadata/metadata.go new file mode 100644 index 0000000..7b134e4 --- /dev/null +++ b/Sargasso/cmd/metadata/metadata.go @@ -0,0 +1,136 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err 
= fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doText(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + fp, err := os.Open(dirname + filename) + x(err) + defer func() { x(fp.Close()) }() + scanner := bufio.NewScanner(fp) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "##META") { + continue + } + aa := strings.Fields(line) + if len(aa) > 4 { + data[base] = append(data[base], + fmt.Sprintf(``, + aa[1], + escape(aa[2]), + escape(strings.Join(aa[4:], " ")))) + } + } + x(scanner.Err()) +} + +func doXml(dirname, filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile(dirname + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/Sargasso/cmd/sargasso/sargasso.go b/Sargasso/cmd/sargasso/sargasso.go new file mode 100644 index 0000000..8110fb6 --- /dev/null +++ b/Sargasso/cmd/sargasso/sargasso.go @@ -0,0 +1,81 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + agent = "AhrefsBot/7.0" +) + +func main() { + req, err := http.NewRequest("GET", 
"https://sargasso.nl/feed/", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) + basename := strings.TrimPrefix(item.Guid, "https://sargasso.nl/?") + if i := strings.LastIndex(basename, "p="); i >= 0 { + basename = basename[i+2:] + } + filename := dirname + "/" + url.PathEscape(basename) + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + } + +} diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go new file mode 100644 index 0000000..61fd657 --- /dev/null +++ b/Sargasso/cmd/xml2txt/xml2txt.go @@ -0,0 +1,97 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "regexp" + "strings" + "time" +) + +type Item struct { + Title string `xml:"title"` + Text string `xml:"encoded"` + Cats []string `xml:"category"` +} + +var ( + x = util.CheckErr + + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) +) + +func main() { + + var ds string + switch len(os.Args) { + case 1: + year, week := time.Now().AddDate(0, 0, -7).ISOWeek() + ds = fmt.Sprintf("%d-%02d", year, week) + case 2: + if !reYearWeek.MatchString(os.Args[1]) { + x(fmt.Errorf("arg must be yyyy-ww")) + } + ds = os.Args[1] + 
default: + x(fmt.Errorf("too many arguments")) + } + dp := ds[:4] + "/" + ds[5:] + + x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp)) + x(os.MkdirAll("out", 0777)) + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + b, err := os.ReadFile(filename) + x(err) + fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + for _, cat := range item.Cats { + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + x(err) + } + _, err = fp.WriteString(addEnd(item.Title)) + x(err) + doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) + x(err) + root := doc.Root() + pp, err := root.Search(`//body//p`) + x(err) + for _, p := range pp { + _, err = fp.WriteString(addEnd(p.Content())) + x(err) + } + x(err) + x(fp.Close()) + } +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/Sargasso/txt2corpus.sh b/Sargasso/txt2corpus.sh new file mode 100755 index 0000000..d656205 --- /dev/null +++ b/Sargasso/txt2corpus.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. 
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds + +cd /net/corpora/nlnieuws/Sargasso/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +../../xml2txt $ds + +rm -f $corpus.lines +for i in out/*.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("sargasso.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/Sikkom/Makefile b/Sikkom/Makefile new file mode 100644 index 0000000..7ec39b7 --- /dev/null +++ b/Sikkom/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + sikkom + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +sikkom: cmd/sikkom/*.go + go build -o $@ $^ diff --git a/Sikkom/cmd/metadata/metadata.go b/Sikkom/cmd/metadata/metadata.go new file mode 100644 index 0000000..fd787db --- /dev/null +++ b/Sikkom/cmd/metadata/metadata.go @@ -0,0 +1,95 @@ +package main + +import ( + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".xml") { + doXml(filename) + } + } + + files, err = os.ReadDir("xml") 
+ x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + x(err) + x(fp.Close()) + x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) + } +} + +func doXml(filename string) { + base := filename[:len(filename)-4] + if _, ok := data[base]; !ok { + data[base] = make([]string, 0) + } + b, err := os.ReadFile("../" + filename) + x(err) + var item Item + x(xml.Unmarshal(b, &item)) + t := time.Unix(item.UnixTime, 0).In(location) + data[base] = append(data[base], + fmt.Sprintf(``, + t.Year(), + int(t.Month()), + t.Day())) +} + +func stripMeta(s string) string { + i1 := strings.Index(s, "") + if i1 < 0 { + return s + } + i2 := i1 + strings.Index(s[i1:], "") + 11 + return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n") +} diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go new file mode 100644 index 0000000..63b3586 --- /dev/null +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -0,0 +1,228 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + Title string `xml:"title"` + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + w = util.WarnErr + agent = "AhrefsBot/7.0" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func 
fileDate(filename string) string { + b, err := os.ReadFile(filename) + if err != nil { + return "" + } + s := string(b) + i1 := strings.Index(s, "") + 10 + i2 := strings.Index(s, "") + return s[i1:i2] +} + +func main() { + resp, err := http.Get("https://www.sikkom.nl/api/feed/rss") + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) + filename := dirname + "/" + url.PathEscape(item.Guid) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + if !doArticle(filename, item.Link, item.Title, t, needUpdate) { + x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".skip") + _ = os.Remove(filename + ".json") + _ = os.Remove(filename + ".txt") + } else { + if (exists(filename+".json") && exists(filename+".txt")) || exists(filename+".skip") { + return true + } + } + + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + 
x(resp.Body.Close()) + + s := string(body) + + ok := true + i1 := strings.Index(s, `"application/ld+json"`) + if i1 < 0 { + ok = false + } else { + i1 += strings.Index(s[i1:], `>`) + 1 + i2 := i1 + strings.Index(s[i1:], ``) + if i2 < i1 { + ok = false + } else { + s = s[i1:i2] + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + fp, err := os.Create(filename + ".json") + x(err) + _, err = fp.WriteString(s) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".json", timestamp, timestamp)) + + doc, err := gokogiri.ParseHtml(body) + x(err) + + root := doc.Root() + + pp, err := root.Search(`//div[contains(@class,"article-page__body")]//p`) + x(err) + + if len(pp) == 0 { + _ = w(fmt.Errorf("empty: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "empty: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + + fp, err = os.Create(filename + ".txt") + x(err) + + _, err = fp.WriteString(addEnd(title)) + x(err) + + for _, p := range pp { + _, err = fp.WriteString(addEnd(p.Content())) + x(err) + } + + x(fp.Close()) + return true +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == 
`?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/Sikkom/txt2corpus.sh b/Sikkom/txt2corpus.sh new file mode 100755 index 0000000..a511f18 --- /dev/null +++ b/Sikkom/txt2corpus.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -e + +unset CDPATH +PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +export TZ=Europe/Amsterdam +. /net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds + +cd /net/corpora/nlnieuws/Sikkom/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("sikkom.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. 
+rm -fr out + +rm -f lock diff --git a/Tzum/Makefile b/Tzum/Makefile new file mode 100644 index 0000000..127b1a9 --- /dev/null +++ b/Tzum/Makefile @@ -0,0 +1,13 @@ +all: \ + xml2txt \ + metadata \ + tzum + +xml2txt: cmd/xml2txt/*.go + go build -o $@ $^ + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +tzum: cmd/tzum/*.go + go build -o $@ $^ diff --git a/Tzum/cmd/metadata/metadata.go b/Tzum/cmd/metadata/metadata.go new file mode 100644 index 0000000..8a6851f --- /dev/null +++ b/Tzum/cmd/metadata/metadata.go @@ -0,0 +1,136 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("", filename) + } + } + files, err = os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText("../", filename) + } else if strings.HasSuffix(filename, ".xml") { + doXml("../", filename) + } + } + + files, err = os.ReadDir("xml") + x(err) + for _, file := range files { + filename := file.Name() + if !strings.HasSuffix(filename, ".xml") { + continue + } + aa := strings.Split(filename, ".") + base := strings.Join(aa[1:len(aa)-2], ".") + b, err := os.ReadFile("xml/" + filename) + x(err) + s := string(b) + i := strings.Index(s, "\n \n") + x(err) + for _, m := range data[base] { + _, err = fp.WriteString(" " + m + "\n") + x(err) + } + _, err = fp.WriteString(" \n ") + x(err) + _, err = fp.WriteString(stripMeta(s[i:])) + 
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any spaces, tabs and line breaks that directly follow the
// closing tag, and returns the remaining text.
//
// s is returned unchanged when it contains no opening tag, or when the
// opening tag is never closed: without this guard a missing closing tag made
// strings.Index return -1 and the cut landed at a meaningless offset.
//
// NOTE(review): in the reviewed copy of this file both tag literals were
// blank (markup had been stripped from string literals). They are restored
// here from the function name and from the 11-byte offset the original added
// after the closing tag, which is len("</metadata>") — confirm against the
// repository.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// Search for the closing tag only after the opening tag, so a stray
	// closing tag earlier in the text is never picked up.
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}
	end := start + rel + len(closeTag)

	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
&http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + year, week := t.ISOWeek() + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) + basename := strings.TrimPrefix(item.Guid, "https://www.tzum.info/?p=") + if i := strings.LastIndex(basename, "/"); i > 0 { + basename = basename[:i] + } + filename := dirname + "/" + url.PathEscape(basename) + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + } + +} diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go new file mode 100644 index 0000000..5e3d7e1 --- /dev/null +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -0,0 +1,100 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "os" + "regexp" + "strings" + "time" +) + +type Item struct { + Title string `xml:"title"` + Text string `xml:"encoded"` + Cats []string `xml:"category"` +} + +var ( + x = util.CheckErr + + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) +) + +func main() { + + var ds string + switch len(os.Args) { + case 1: + year, week := time.Now().AddDate(0, 0, -7).ISOWeek() + ds = fmt.Sprintf("%d-%02d", year, week) + case 2: + if !reYearWeek.MatchString(os.Args[1]) { + x(fmt.Errorf("arg must be yyyy-ww")) + } + ds = os.Args[1] + default: + x(fmt.Errorf("too many arguments")) + } + dp := ds[:4] + "/" + ds[5:] + + 
// addEnd normalises one paragraph of text into a single output line.
//
// Leading and trailing white space is trimmed. An empty result yields "".
// Otherwise the text is returned followed by a newline; when it does not
// already end in '.', '!' or '?' (optionally followed by a closing single or
// double quote), a full stop is appended first.
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	// Endings that already count as sentence-final.
	terminated := []string{
		".", "!", "?",
		`."`, `!"`, `?"`,
		`.'`, `!'`, `?'`,
	}
	for _, ending := range terminated {
		if strings.HasSuffix(text, ending) {
			return text + "\n"
		}
	}

	return text + ".\n"
}
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds + +cd /net/corpora/nlnieuws/Tzum/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +../../xml2txt $ds + +rm -f $corpus.lines +for i in out/*.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("tzum.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/VRT/Makefile b/VRT/Makefile new file mode 100644 index 0000000..10bfb0f --- /dev/null +++ b/VRT/Makefile @@ -0,0 +1,9 @@ +all: \ + metadata \ + vrt + +metadata: cmd/metadata/*.go + go build -o $@ $^ + +vrt: cmd/vrt/*.go + go build -o $@ $^ diff --git a/VRT/cmd/metadata/metadata.go b/VRT/cmd/metadata/metadata.go new file mode 100644 index 0000000..0cc0262 --- /dev/null +++ b/VRT/cmd/metadata/metadata.go @@ -0,0 +1,126 @@ +package main + +import ( + "github.com/pebbe/util" + + "bufio" + "encoding/xml" + "fmt" + "html" + "os" + "strings" + "time" +) + +type Item struct { + XMLName xml.Name `xml:"item"` + UnixTime int64 `xml:"unixTime"` +} + +var ( + x = util.CheckErr + escape = html.EscapeString + data = make(map[string][]string) + location *time.Location +) + +func main() { + var err error + location, err = time.LoadLocation("Europe/Amsterdam") + x(err) + + files, err := os.ReadDir("..") + x(err) + for _, file := range files { + filename := file.Name() + if strings.HasSuffix(filename, ".txt") { + doText(filename) + } else if 
// stripMeta removes the first <metadata>...</metadata> element from s,
// together with any spaces, tabs and line breaks that directly follow the
// closing tag, and returns the remaining text.
//
// s is returned unchanged when it contains no opening tag, or when the
// opening tag is never closed: without this guard a missing closing tag made
// strings.Index return -1 and the cut landed at a meaningless offset.
//
// NOTE(review): in the reviewed copy of this file both tag literals were
// blank (markup had been stripped from string literals). They are restored
// here from the function name and from the 11-byte offset the original added
// after the closing tag, which is len("</metadata>") — confirm against the
// repository.
func stripMeta(s string) string {
	const (
		openTag  = "<metadata>"
		closeTag = "</metadata>"
	)

	start := strings.Index(s, openTag)
	if start < 0 {
		return s
	}

	// Search for the closing tag only after the opening tag, so a stray
	// closing tag earlier in the text is never picked up.
	rel := strings.Index(s[start:], closeTag)
	if rel < 0 {
		return s
	}
	end := start + rel + len(closeTag)

	return s[:start] + strings.TrimLeft(s[end:], " \t\r\n")
}
// fileDate returns the text found between the first <unixTime> tag in the
// named file and the closing </unixTime> tag that follows it.
//
// It returns "" when the file cannot be read, or when the element is missing
// or never closed. The original sliced the file content with unchecked
// indices, so any such file caused a slice-bounds panic. The caller compares
// the result with the timestamp it is about to write, so "" simply means
// "needs updating".
//
// NOTE(review): in the reviewed copy of this file both tag literals were
// blank (markup had been stripped from string literals). They are restored
// here from the 10-byte offset the original added after the opening tag,
// which is len("<unixTime>"), and from the `xml:"unixTime"` field tag used
// elsewhere in the file — confirm against the repository.
func fileDate(filename string) string {
	const (
		openTag  = "<unixTime>"
		closeTag = "</unixTime>"
	)

	b, err := os.ReadFile(filename)
	if err != nil {
		// A missing or unreadable file has no recorded date.
		return ""
	}

	_, rest, found := strings.Cut(string(b), openTag)
	if !found {
		return ""
	}
	value, _, found := strings.Cut(rest, closeTag)
	if !found {
		return ""
	}
	return value
}
url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/")) + + ts := fmt.Sprintf("%d", t.Unix()) + needUpdate := fileDate(filename+".xml") != ts + + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + + var link string + for _, l := range item.Link { + if l.Type == "text/html" { + link = l.Href + } + } + if !doArticle(filename, link, item.Title.Text, item.Nstag, item.Nslabeltag, t, needUpdate) { + x(os.Remove(filename + ".xml")) + } + } +} + +func doArticle(filename string, url string, title string, tags []string, labels []string, timestamp time.Time, needUpdate bool) bool { + if needUpdate { + _ = os.Remove(filename + ".err") + _ = os.Remove(filename + ".txt") + _ = os.Remove(filename + ".html") + _ = os.Remove(filename + ".skip") + } else { + if exists(filename+".txt") || exists(filename+".skip") { + return true + } + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + /* + s := string(body) + ok := true + i1 := strings.Index(s, `type="application/ld+json"`) + if i1 < 0 { + ok = false + } else { + i1 += strings.Index(s[i1:], `>`) + 1 + i2 := i1 + strings.Index(s[i1:], ``) + if i2 < i1 { + ok = false + } else { + s = s[i1:i2] + } + } + if !ok { + _ = w(fmt.Errorf("script jsonld not found: %s", url)) + + fp, err := os.Create(filename + ".err") + x(err) + _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".err", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + x(err) + _, err = fp.Write(body) + x(err) + 
x(fp.Close()) + x(os.Chtimes(filename+".html", timestamp, timestamp)) + + return false + } + fp, err := os.Create(filename + ".json") + x(err) + _, err = fp.WriteString(s) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".json", timestamp, timestamp)) + */ + + var buf bytes.Buffer + + doc, err := gokogiri.ParseHtml(body) + x(err) + + root := doc.Root() + + lnn, err := root.Search(`//head/link[@rel="canonical"]/@href`) + x(err) + for _, ln := range lnn { + if strings.Contains(ln.String(), "/liveblog/") { + fp, err := os.Create(filename + ".skip") + x(err) + _, err = fp.WriteString("liveblog\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".skip", timestamp, timestamp)) + return true + } + } + + if len(tags) == 0 { + _, err = fmt.Fprintln(&buf, "##META text cat =") + x(err) + } else { + for _, tag := range tags { + _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", tag) + x(err) + } + } + if len(labels) == 0 { + _, err = fmt.Fprintln(&buf, "##META text label =") + x(err) + } else { + for _, label := range labels { + _, err = fmt.Fprintf(&buf, "##META text label = %s\n", label) + x(err) + } + } + + _, err = buf.WriteString(clean(title)) + x(err) + + fouten := make([]string, 0) + + found := false + pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`) + x(err) + for _, p := range pp { + _, err = fmt.Fprint(&buf, clean(p.Content())) + x(err) + found = true + } + if !found { + fouten = append(fouten, fmt.Sprintf("no heading: %s", url)) + _ = w(fmt.Errorf("no heading: %s", url)) + } + + found = false + pp, err = root.Search( + `//div[@data-sentry-component="ArticleText"]//p[contains(@class,"prose-article-body-r")]` + + ` | ` + + `//div[@data-sentry-component="ArticleTitle"]//h2`) + x(err) + for _, p := range pp { + _, err = fmt.Fprint(&buf, clean(p.Content())) + x(err) + found = true + } + if !found { + fouten = append(fouten, fmt.Sprintf("no text: %s", url)) + _ = w(fmt.Errorf("no text: %s", url)) + } 
// clean turns a fragment of page text into one output line.
//
// Every run of white space (including line breaks) is collapsed to a single
// space and surrounding white space is dropped. An empty result yields "".
// Otherwise the text is returned followed by a newline; when it does not
// already end in '.', '!' or '?' (optionally followed by a closing single or
// double quote), a full stop is appended first.
func clean(s string) string {
	line := strings.Join(strings.Fields(s), " ")
	if line == "" {
		return ""
	}

	last := len(line) - 1
	switch line[last] {
	case '.', '!', '?':
		return line + "\n"
	case '"', '\'':
		// A closing quote only counts when it follows sentence punctuation.
		if last > 0 {
			switch line[last-1] {
			case '.', '!', '?':
				return line + "\n"
			}
		}
	}

	return line + ".\n"
}
/net/aps/etc/alpino-activate.sh > /dev/null + +if [ "$1" = "" ] +then + ds=`ISODate -7` +else + case "$1" in + 2[0-9][0-9][0-9]-[0-5][0-9]) + ds=$1 + ;; + *) + echo INVALID + exit 1 + ;; + esac +fi + +dp=${ds//-//} + +corpus=/net/corpora/nlnieuws/VRT/corpus/$ds + +cd /net/corpora/nlnieuws/VRT/$dp + +ln -s lock.$$ lock +if [ "`readlink lock`" != lock.$$ ] +then + echo Getting lock failed + exit 1 +fi + +rm -fr out +mkdir out + +rm -f $corpus.lines +for i in *.txt +do + b=`basename $i .txt` + perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \ + | perl -e '$n = 0; while(<>) { $n++; print("vrt.'$b'.$n|$_"); }' \ + >> $corpus.lines +done + +cd out +mkdir xml +Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log + +../../../metadata 2> err +rm err + +cd xml +alto -o $corpus.data.dz *.xml 2> /dev/null + +cd ../.. +rm -fr out + +rm -f lock diff --git a/cmd/ISOWeek/ISOWeek.go b/cmd/ISOWeek/ISOWeek.go new file mode 100644 index 0000000..fe518bb --- /dev/null +++ b/cmd/ISOWeek/ISOWeek.go @@ -0,0 +1,48 @@ +package main + +/* + +Waarom? + +We willen year-week, bijvoorbeeld 2025-52 + +Als de datum 1 januari 2027 is, dan geeft dit: + + date +%Y-%V + +... dit: + + 2027-53 + +Dat is fout. Het moet zijn: + + 2026-53 + +Dit programma geeft wel de juiste uitvoer. 
+ +*/ + +import ( + "github.com/pebbe/util" + + "fmt" + "os" + "strconv" + "time" +) + +var ( + x = util.CheckErr +) + +func main() { + // arg 1: aantal dagen opgeteld bij huidige datum + d, err := strconv.Atoi(os.Args[1]) + x(err) + + location, err := time.LoadLocation("Europe/Amsterdam") + x(err) + + year, week := time.Now().AddDate(0, 0, d).In(location).ISOWeek() + fmt.Printf("%d-%02d\n", year, week) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f106190 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module nlnieuws + +go 1.25.0 + +require ( + github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 + github.com/pebbe/util v0.9.0 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..779090e --- /dev/null +++ b/go.sum @@ -0,0 +1,4 @@ +github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4= +github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw= +github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA= +github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=