fix voor zinnen aan elkaar

This commit is contained in:
Peter Kleiweg
2026-05-29 17:22:10 +02:00
parent ca4e7af8fa
commit 14590570ba
18 changed files with 54 additions and 32 deletions

View File

@@ -58,7 +58,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item), filename) x(xml.Unmarshal(b, &item), filename)
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p | //body/h2`) pp, err := root.Search(`//body/p | //body/h2`)

View File

@@ -159,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -155,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -168,6 +168,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
/* /*
s := string(body) s := string(body)
ok = true ok = true

View File

@@ -63,7 +63,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body//p`) pp, err := root.Search(`//body//p`)

View File

@@ -69,7 +69,9 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
x(fp.WriteString(u.FixSpace(item.Text))) for _, line := range strings.SplitAfter(item.Text, "\n") {
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
}
x(fp.Close()) x(fp.Close())
} }
} }

View File

@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body) s := string(body)
ok := true ok := true
i1 := strings.Index(s, `<script type="application/ld+json"`) i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind // text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n") lines := strings.Split(text, "\n")
for i, line := range lines { for i, line := range lines {
line = u.FixSpace(line) lines[i] = u.AddEnd(u.FixSpace(line, true))
n := len(line)
if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") {
continue
}
}
if n > 1 {
s := line[n-2:]
if s == `."` || s == `!"` || s == `?"` {
continue
}
}
lines[i] = line + "."
} }
text = strings.Join(lines, "\n") + "\n" text = strings.Join(lines, "") + "\n"
fp, err := os.Create(filename + ".txt") fp, err := os.Create(filename + ".txt")
p(err) p(err)

View File

@@ -154,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -65,7 +65,7 @@ func main() {
} }
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p`) pp, err := root.Search(`//body/p`)

View File

@@ -164,6 +164,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
doc, err := gokogiri.ParseHtml(body) doc, err := gokogiri.ParseHtml(body)
p(err) p(err)

View File

@@ -64,7 +64,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
divs, err := root.Search(`//div[@class="donatieformlinks"]`) divs, err := root.Search(`//div[@class="donatieformlinks"]`)

View File

@@ -226,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind // text bevat kopjes zonder punt aan het eind
lines := strings.Split(doc.Text, "\n") lines := strings.Split(doc.Text, "\n")
for i, line := range lines { for i, line := range lines {
lines[i] = u.AddEnd(u.FixSpace(line)) lines[i] = u.AddEnd(u.FixSpace(line, true))
} }
text := strings.Join(lines, "") + "\n" text := strings.Join(lines, "") + "\n"
@@ -240,9 +240,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
if strings.HasPrefix(t, "br_") { if strings.HasPrefix(t, "br_") {
continue continue
} }
if strings.HasPrefix(t, "tr_") { t = strings.TrimPrefix(t, "tr_")
t = t[3:]
}
p(fmt.Fprintf(fp, "##META text tag = %s\n", t)) p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
} }
} }

View File

@@ -62,7 +62,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body//p`) pp, err := root.Search(`//body//p`)

View File

@@ -152,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
s := string(body) s := string(body)
ok := true ok := true

View File

@@ -66,7 +66,7 @@ func main() {
x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
} }
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
x(err) x(err)
root := doc.Root() root := doc.Root()
pp, err := root.Search(`//body/p`) pp, err := root.Search(`//body/p`)

View File

@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
p(err) p(err)
p(resp.Body.Close()) p(resp.Body.Close())
body = u.HtmlFix(body)
/* /*
s := string(body) s := string(body)
ok := true ok := true

View File

@@ -13,10 +13,19 @@ import (
var ( var (
p = e.PanicErr p = e.PanicErr
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`) reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
reLET = regexp.MustCompile(`\p{Lu}`) reLET = regexp.MustCompile(`\p{Lu}`)
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
) )
// HtmlFix returns a copy of body in which every match of reBody (a <br> tag
// in any letter case, optionally followed by spaces and slashes before the
// closing '>') has been replaced by a single space character.
//
// The replacement is literal: the space is inserted as-is, with no expansion
// of regexp submatch references.
func HtmlFix(body []byte) []byte {
	space := []byte(" ")
	return reBody.ReplaceAllLiteral(body, space)
}
// HtmlFixString is the string counterpart of HtmlFix: it returns body with
// every match of reBody (a <br> tag in any letter case, optionally followed
// by spaces and slashes before the closing '>') replaced by a single space.
//
// The replacement is literal: the space is inserted as-is, with no expansion
// of regexp submatch references.
func HtmlFixString(body string) string {
	const space = " "
	return reBody.ReplaceAllLiteralString(body, space)
}
func AddEnd(s string) string { func AddEnd(s string) string {
s = strings.TrimSpace(s) s = strings.TrimSpace(s)
if s == "" { if s == "" {
@@ -28,12 +37,20 @@ func AddEnd(s string) string {
return s + ".\n" return s + ".\n"
} }
func FixSpace(s string) string { func FixSpace(s string, opt ...bool) string {
s = strings.Join(strings.Fields(s), " ") s = strings.Join(strings.Fields(s), " ")
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
i := reLET.FindStringIndex(s1)[0] if len(opt) > 0 && opt[0] {
return s1[:i] + " " + s1[i:] s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
}) if strings.HasSuffix(s1, ".") {
// zoals: v.Chr.
return s1
}
i := reLET.FindStringIndex(s1)[0]
return s1[:i] + " " + s1[i:]
})
}
return s return s
} }