tags; .De -> . De
This commit is contained in:
@@ -59,7 +59,10 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
t := u.FixSpace(cat)
|
||||||
|
if t != "Nieuws" {
|
||||||
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
|
|||||||
@@ -286,7 +286,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
}
|
}
|
||||||
for _, el := range ell {
|
for _, el := range ell {
|
||||||
s := strings.TrimSpace(el.Content())
|
s := strings.TrimSpace(el.Content())
|
||||||
if s != "" {
|
if s != "" && s != "Nieuws" {
|
||||||
tags = append(tags, s)
|
tags = append(tags, s)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
14
README.md
14
README.md
@@ -43,21 +43,22 @@ crontab van p209327@colossus
|
|||||||
|
|
||||||
Uitvoer in `[A-Z]*/corpus/`
|
Uitvoer in `[A-Z]*/corpus/`
|
||||||
|
|
||||||
NieuwsNL elke dag, de rest alleen op dinsdag
|
|
||||||
|
|
||||||
crontab van p209327@colossus
|
crontab van p209327@colossus
|
||||||
|
|
||||||
```
|
```
|
||||||
# m h dom mon dow command
|
# m h dom mon dow command
|
||||||
|
# veel data: elke dag
|
||||||
|
0 1 * * * /net/corpora/nlnieuws/HLN/txt2corpus.sh
|
||||||
|
0 1 * * * /net/corpora/nlnieuws/NOS/txt2corpus.sh
|
||||||
|
0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh
|
||||||
|
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
|
||||||
|
0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh
|
||||||
|
# weinig data: alleen op dinsdag
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/HLN/txt2corpus.sh
|
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh
|
||||||
0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
|
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/NOS/txt2corpus.sh
|
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/NU/txt2corpus.sh
|
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh
|
||||||
@@ -65,7 +66,6 @@ crontab van p209327@colossus
|
|||||||
0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh
|
0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh
|
||||||
0 1 * * 2 /net/corpora/nlnieuws/VRT/txt2corpus.sh
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 3. Queries uitvoeren, tellingen doen
|
## 3. Queries uitvoeren, tellingen doen
|
||||||
|
|||||||
@@ -92,7 +92,10 @@ func main() {
|
|||||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||||
x(err)
|
x(err)
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
t := u.FixSpace(cat)
|
||||||
|
if t != "Artikelen" && t != "cafeyn" {
|
||||||
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
x(fp.WriteString(text))
|
x(fp.WriteString(text))
|
||||||
x(fp.Close())
|
x(fp.Close())
|
||||||
|
|||||||
@@ -236,7 +236,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
p(fmt.Fprintln(fp, "##META text tag ="))
|
p(fmt.Fprintln(fp, "##META text tag ="))
|
||||||
} else {
|
} else {
|
||||||
for _, tag := range doc.Tags {
|
for _, tag := range doc.Tags {
|
||||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(u.FixSpace(tag))))
|
t := strings.ToLower(u.FixSpace(tag))
|
||||||
|
if strings.HasPrefix(t, "br_") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(t, "tr_") {
|
||||||
|
t = t[3:]
|
||||||
|
}
|
||||||
|
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if doc.Cat == "" {
|
if doc.Cat == "" {
|
||||||
|
|||||||
@@ -59,7 +59,11 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
t := u.FixSpace(cat)
|
||||||
|
if t == "Nieuws" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -49,13 +50,14 @@ var (
|
|||||||
parts = map[string]struct {
|
parts = map[string]struct {
|
||||||
file string
|
file string
|
||||||
suffix string
|
suffix string
|
||||||
|
re *regexp.Regexp
|
||||||
}{
|
}{
|
||||||
"nieuwe namen": {"nieuwe-namen", ".t20"},
|
"nieuwe namen": {"nieuwe-namen", ".t20", nil},
|
||||||
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
|
"nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil},
|
||||||
"personen": {"personen", ""},
|
"personen": {"personen", "", nil},
|
||||||
"andere namen": {"overige-namen", ""},
|
"andere namen": {"overige-namen", "", nil},
|
||||||
"locaties": {"locaties", ""},
|
"locaties": {"locaties", "", nil},
|
||||||
"organisaties": {"organisaties", ""},
|
"organisaties": {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)},
|
||||||
}
|
}
|
||||||
|
|
||||||
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
|
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
|
||||||
@@ -142,12 +144,15 @@ func makeValues(source, part string) [][5]any {
|
|||||||
scanner := bufio.NewScanner(fp)
|
scanner := bufio.NewScanner(fp)
|
||||||
lineno := 0
|
lineno := 0
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
lineno++
|
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
aa := strings.Split(line, "\t")
|
aa := strings.Split(line, "\t")
|
||||||
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||||
x(err)
|
x(err)
|
||||||
word := aa[1]
|
word := aa[1]
|
||||||
|
if parts[part].re != nil && parts[part].re.MatchString(word) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
lineno++
|
||||||
var tags, lemma, postag string
|
var tags, lemma, postag string
|
||||||
if len(aa) > 2 {
|
if len(aa) > 2 {
|
||||||
tags = aa[2]
|
tags = aa[2]
|
||||||
|
|||||||
2
go.mod
2
go.mod
@@ -5,7 +5,9 @@ go 1.26.1
|
|||||||
require (
|
require (
|
||||||
codeberg.org/pebbe/errors v0.4.0
|
codeberg.org/pebbe/errors v0.4.0
|
||||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
|
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
|
||||||
|
github.com/pebbe/compactcorpus v1.0.3
|
||||||
github.com/pebbe/textcat/v2 v2.3.0
|
github.com/pebbe/textcat/v2 v2.3.0
|
||||||
|
github.com/rug-compling/alpinods v1.18.1
|
||||||
)
|
)
|
||||||
|
|
||||||
require github.com/pebbe/util v0.9.0 // indirect
|
require github.com/pebbe/util v0.9.0 // indirect
|
||||||
|
|||||||
4
go.sum
4
go.sum
@@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
|
|||||||
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
|
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
|
||||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
|
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
|
||||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
|
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
|
||||||
|
github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ=
|
||||||
|
github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ=
|
||||||
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
|
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
|
||||||
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
|
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
|
||||||
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
|
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
|
||||||
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
|
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
|
||||||
|
github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w=
|
||||||
|
github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q=
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ import (
|
|||||||
var (
|
var (
|
||||||
p = e.PanicErr
|
p = e.PanicErr
|
||||||
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
|
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
|
||||||
|
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`)
|
||||||
|
reLET = regexp.MustCompile(`\p{Lu}`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func AddEnd(s string) string {
|
func AddEnd(s string) string {
|
||||||
@@ -27,7 +29,12 @@ func AddEnd(s string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func FixSpace(s string) string {
|
func FixSpace(s string) string {
|
||||||
return strings.Join(strings.Fields(s), " ")
|
s = strings.Join(strings.Fields(s), " ")
|
||||||
|
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
|
||||||
|
i := reLET.FindStringIndex(s1)[0]
|
||||||
|
return s1[:i] + " " + s1[i:]
|
||||||
|
})
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
func MkLock(filename string) {
|
func MkLock(filename string) {
|
||||||
|
|||||||
66
oud/fix.go
Normal file
66
oud/fix.go
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
/*
|
||||||
|
Dit past corpora aan
|
||||||
|
|
||||||
|
Tags verwijderen:
|
||||||
|
|
||||||
|
Oog: Nieuws
|
||||||
|
Parool: Nieuws
|
||||||
|
RO: Artikelen, cafeyn
|
||||||
|
RTVNoord: br_*
|
||||||
|
Tzum: Nieuws
|
||||||
|
|
||||||
|
Tags veranderen:
|
||||||
|
|
||||||
|
RTVNoord: tr_* → *
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
import (
|
||||||
|
e "codeberg.org/pebbe/errors"
|
||||||
|
cc "github.com/pebbe/compactcorpus"
|
||||||
|
"github.com/rug-compling/alpinods"
|
||||||
|
|
||||||
|
"encoding/xml"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
x = e.ExitErr
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
for _, file := range os.Args[1:] {
|
||||||
|
base := strings.TrimSuffix(file, ".data.dz")
|
||||||
|
newfile := base + "-new.data.dz"
|
||||||
|
|
||||||
|
incc, err := cc.Open(file)
|
||||||
|
x(err)
|
||||||
|
outcc, err := cc.NewCorpus(newfile)
|
||||||
|
x(err)
|
||||||
|
r, err := incc.NewRange()
|
||||||
|
x(err)
|
||||||
|
for r.HasNext() {
|
||||||
|
name, data := r.Next()
|
||||||
|
fmt.Printf("%s %s \r", base, name)
|
||||||
|
var alpino alpinods.AlpinoDS
|
||||||
|
x(xml.Unmarshal(data, &alpino))
|
||||||
|
for i := 0; i < len(alpino.Metadata.Meta); i++ {
|
||||||
|
if alpino.Metadata.Meta[i].Name != "tag" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
|
||||||
|
alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
|
||||||
|
i--
|
||||||
|
} else if strings.HasPrefix(n, "tr_") {
|
||||||
|
alpino.Metadata.Meta[i].Value = n[3:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outcc.Write(name, []byte(alpino.String()))
|
||||||
|
}
|
||||||
|
x(outcc.Close())
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user