123 lines
2.5 KiB
Go
123 lines
2.5 KiB
Go
package main
|
|
|
|
import (
|
|
e "codeberg.org/pebbe/errors"
|
|
"github.com/jbowtie/gokogiri"
|
|
"github.com/pebbe/textcat/v2"
|
|
|
|
"bytes"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Item holds the parts of one XML article document that this program uses.
// It is filled by encoding/xml, which matches the tags below against the
// local names of the child elements of the document's root element.
type Item struct {
	// Title is the character data of the <title> element.
	Title string `xml:"title"`

	// Text is the character data of the <encoded> element; main parses it
	// as an HTML fragment.
	Text string `xml:"encoded"`

	// Cats has one entry per <category> element, in document order.
	Cats []string `xml:"category"`
}
|
|
|
|
var (
|
|
x = e.ExitErr
|
|
w = e.WarnErr
|
|
|
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
|
)
|
|
|
|
func main() {
|
|
|
|
tc := textcat.NewTextCat()
|
|
tc.EnableLanguages("en.utf8", "nl.utf8")
|
|
|
|
var ds string
|
|
switch len(os.Args) {
|
|
case 1:
|
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
|
ds = fmt.Sprintf("%d-%02d", year, week)
|
|
case 2:
|
|
if !reYearWeek.MatchString(os.Args[1]) {
|
|
x(fmt.Errorf("arg must be yyyy-ww"))
|
|
}
|
|
ds = os.Args[1]
|
|
default:
|
|
x(fmt.Errorf("too many arguments"))
|
|
}
|
|
dp := ds[:4] + "/" + ds[5:]
|
|
|
|
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
|
|
x(os.MkdirAll("out", 0777))
|
|
files, err := os.ReadDir(".")
|
|
x(err)
|
|
for _, file := range files {
|
|
filename := file.Name()
|
|
if !strings.HasSuffix(filename, ".xml") {
|
|
continue
|
|
}
|
|
b, err := os.ReadFile(filename)
|
|
x(err)
|
|
var buf bytes.Buffer
|
|
var item Item
|
|
x(xml.Unmarshal(b, &item))
|
|
x(buf.WriteString(addEnd(fixSpace(item.Title))))
|
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
|
x(err)
|
|
root := doc.Root()
|
|
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
|
|
x(err)
|
|
for _, div := range divs {
|
|
div.Remove()
|
|
}
|
|
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
|
x(err)
|
|
for _, p := range pp {
|
|
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
|
|
}
|
|
|
|
text := buf.String()
|
|
langs, err := tc.Classify(text)
|
|
if err != nil {
|
|
_ = w(fmt.Errorf("language: %v in %s", err, filename))
|
|
continue
|
|
}
|
|
if len(langs) != 1 || langs[0] != "nl.utf8" {
|
|
_ = w(fmt.Errorf("language: %v in %s", langs, filename))
|
|
continue
|
|
}
|
|
|
|
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
|
x(err)
|
|
for _, cat := range item.Cats {
|
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
|
|
}
|
|
x(fp.WriteString(text))
|
|
x(fp.Close())
|
|
}
|
|
}
|
|
|
|
// addEnd trims surrounding whitespace from s and returns it as one line that
// ends like a finished sentence.
//
// If s is empty after trimming, "" is returned (no newline). Otherwise the
// result is s followed by "\n" when s already ends in '.', '!' or '?',
// optionally followed by a single closing quote; in every other case ".\n"
// is appended.
//
// The closing quotes recognized are the ASCII " and ' and the typographic
// ” (U+201D) and ’ (U+2019), so that text such as “Ja.” does not get a
// spurious extra period.
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Step over at most one trailing closing quote to reach the character
	// that should be the sentence terminator.
	core := s
	for _, q := range []string{`"`, `'`, "\u201d", "\u2019"} {
		if strings.HasSuffix(core, q) {
			core = core[:len(core)-len(q)]
			break
		}
	}

	// core is empty when s consists of just a quote; that does not count as
	// terminated. The terminators are ASCII, so looking at the last byte is
	// safe even when the text ends in a multi-byte character.
	if core != "" && strings.IndexByte(".!?", core[len(core)-1]) >= 0 {
		return s + "\n"
	}
	return s + ".\n"
}
|
|
|
|
// fixSpace normalizes the whitespace in s: it splits s into the fields that
// strings.Fields finds and joins them with single spaces, which also removes
// leading and trailing whitespace. A string with no fields yields "".
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
|