fix voor zinnen aan elkaar
This commit is contained in:
@@ -58,7 +58,7 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item), filename)
|
x(xml.Unmarshal(b, &item), filename)
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
pp, err := root.Search(`//body/p | //body/h2`)
|
pp, err := root.Search(`//body/p | //body/h2`)
|
||||||
|
|||||||
@@ -159,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
doc, err := gokogiri.ParseHtml(body)
|
doc, err := gokogiri.ParseHtml(body)
|
||||||
p(err)
|
p(err)
|
||||||
|
|
||||||
|
|||||||
@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
doc, err := gokogiri.ParseHtml(body)
|
doc, err := gokogiri.ParseHtml(body)
|
||||||
p(err)
|
p(err)
|
||||||
|
|
||||||
|
|||||||
@@ -155,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
doc, err := gokogiri.ParseHtml(body)
|
doc, err := gokogiri.ParseHtml(body)
|
||||||
p(err)
|
p(err)
|
||||||
|
|
||||||
|
|||||||
@@ -168,6 +168,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
s := string(body)
|
s := string(body)
|
||||||
ok = true
|
ok = true
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ func main() {
|
|||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
pp, err := root.Search(`//body//p`)
|
pp, err := root.Search(`//body//p`)
|
||||||
|
|||||||
@@ -69,7 +69,9 @@ func main() {
|
|||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
x(fp.WriteString(u.FixSpace(item.Text)))
|
for _, line := range strings.SplitAfter(item.Text, "\n") {
|
||||||
|
x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
|
||||||
|
}
|
||||||
x(fp.Close())
|
x(fp.Close())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
s := string(body)
|
s := string(body)
|
||||||
ok := true
|
ok := true
|
||||||
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
i1 := strings.Index(s, `<script type="application/ld+json"`)
|
||||||
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
// text bevat kopjes zonder punt aan het eind
|
// text bevat kopjes zonder punt aan het eind
|
||||||
lines := strings.Split(text, "\n")
|
lines := strings.Split(text, "\n")
|
||||||
for i, line := range lines {
|
for i, line := range lines {
|
||||||
line = u.FixSpace(line)
|
lines[i] = u.AddEnd(u.FixSpace(line, true))
|
||||||
n := len(line)
|
|
||||||
if n > 0 {
|
|
||||||
if strings.ContainsAny(line[n-1:], ".!?") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if n > 1 {
|
|
||||||
s := line[n-2:]
|
|
||||||
if s == `."` || s == `!"` || s == `?"` {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lines[i] = line + "."
|
|
||||||
}
|
}
|
||||||
text = strings.Join(lines, "\n") + "\n"
|
text = strings.Join(lines, "") + "\n"
|
||||||
|
|
||||||
fp, err := os.Create(filename + ".txt")
|
fp, err := os.Create(filename + ".txt")
|
||||||
p(err)
|
p(err)
|
||||||
|
|||||||
@@ -154,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
doc, err := gokogiri.ParseHtml(body)
|
doc, err := gokogiri.ParseHtml(body)
|
||||||
p(err)
|
p(err)
|
||||||
|
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
pp, err := root.Search(`//body/p`)
|
pp, err := root.Search(`//body/p`)
|
||||||
|
|||||||
@@ -164,6 +164,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
doc, err := gokogiri.ParseHtml(body)
|
doc, err := gokogiri.ParseHtml(body)
|
||||||
p(err)
|
p(err)
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
|
divs, err := root.Search(`//div[@class="donatieformlinks"]`)
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
// text bevat kopjes zonder punt aan het eind
|
// text bevat kopjes zonder punt aan het eind
|
||||||
lines := strings.Split(doc.Text, "\n")
|
lines := strings.Split(doc.Text, "\n")
|
||||||
for i, line := range lines {
|
for i, line := range lines {
|
||||||
lines[i] = u.AddEnd(u.FixSpace(line))
|
lines[i] = u.AddEnd(u.FixSpace(line, true))
|
||||||
}
|
}
|
||||||
text := strings.Join(lines, "") + "\n"
|
text := strings.Join(lines, "") + "\n"
|
||||||
|
|
||||||
@@ -240,9 +240,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
if strings.HasPrefix(t, "br_") {
|
if strings.HasPrefix(t, "br_") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if strings.HasPrefix(t, "tr_") {
|
t = strings.TrimPrefix(t, "tr_")
|
||||||
t = t[3:]
|
|
||||||
}
|
|
||||||
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ func main() {
|
|||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
pp, err := root.Search(`//body//p`)
|
pp, err := root.Search(`//body//p`)
|
||||||
|
|||||||
@@ -152,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
s := string(body)
|
s := string(body)
|
||||||
|
|
||||||
ok := true
|
ok := true
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ func main() {
|
|||||||
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
|
||||||
}
|
}
|
||||||
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
root := doc.Root()
|
root := doc.Root()
|
||||||
pp, err := root.Search(`//body/p`)
|
pp, err := root.Search(`//body/p`)
|
||||||
|
|||||||
@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
|
|||||||
p(err)
|
p(err)
|
||||||
p(resp.Body.Close())
|
p(resp.Body.Close())
|
||||||
|
|
||||||
|
body = u.HtmlFix(body)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
s := string(body)
|
s := string(body)
|
||||||
ok := true
|
ok := true
|
||||||
|
|||||||
@@ -13,10 +13,19 @@ import (
|
|||||||
var (
|
var (
|
||||||
p = e.PanicErr
|
p = e.PanicErr
|
||||||
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
|
reEOL = regexp.MustCompile(`[.!?]['"”’]?$`)
|
||||||
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`)
|
reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
|
||||||
reLET = regexp.MustCompile(`\p{Lu}`)
|
reLET = regexp.MustCompile(`\p{Lu}`)
|
||||||
|
reBody = regexp.MustCompile(`<[bB][rR][ /]*>`)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func HtmlFix(body []byte) []byte {
|
||||||
|
return reBody.ReplaceAllLiteral(body, []byte(" "))
|
||||||
|
}
|
||||||
|
|
||||||
|
func HtmlFixString(body string) string {
|
||||||
|
return reBody.ReplaceAllLiteralString(body, " ")
|
||||||
|
}
|
||||||
|
|
||||||
func AddEnd(s string) string {
|
func AddEnd(s string) string {
|
||||||
s = strings.TrimSpace(s)
|
s = strings.TrimSpace(s)
|
||||||
if s == "" {
|
if s == "" {
|
||||||
@@ -28,12 +37,20 @@ func AddEnd(s string) string {
|
|||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
func FixSpace(s string) string {
|
func FixSpace(s string, opt ...bool) string {
|
||||||
s = strings.Join(strings.Fields(s), " ")
|
s = strings.Join(strings.Fields(s), " ")
|
||||||
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
|
|
||||||
i := reLET.FindStringIndex(s1)[0]
|
if len(opt) > 0 && opt[0] {
|
||||||
return s1[:i] + " " + s1[i:]
|
s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string {
|
||||||
})
|
if strings.HasSuffix(s1, ".") {
|
||||||
|
// zoals: v.Chr.
|
||||||
|
return s1
|
||||||
|
}
|
||||||
|
i := reLET.FindStringIndex(s1)[0]
|
||||||
|
return s1[:i] + " " + s1[i:]
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user