var (
	// reEM matches one emphasized span as marked up by em1: an opening
	// "::EM::" marker, the shortest possible run of characters, and the
	// closing "::/EM::" marker. "." does not match a newline, so a span
	// that contains a line break is not matched.
	reEM = regexp.MustCompile(`::EM::.*?::/EM::`)

	// reTitle reports whether a string starts with an uppercase letter
	// (Unicode category Lu).
	reTitle = regexp.MustCompile(`^\p{Lu}`)
)

// em1 rewrites HTML emphasis tags into plain-text markers that survive HTML
// parsing and text extraction: "<em>" becomes " ::EM::" and "</em>" becomes
// "::/EM:: ". The markers are turned back into plain text by em2.
//
// Each marker is padded with one space on its outer side; presumably this
// keeps emphasized text from fusing with adjacent words once the tags are
// gone.
//
// NOTE(review): the search strings must be the literal tags. With an empty
// search string, strings.ReplaceAll inserts the replacement at the start of
// the string and after every rune, which would destroy the text.
func em1(s string) string {
	opened := strings.ReplaceAll(s, "<em>", " ::EM::")
	return strings.ReplaceAll(opened, "</em>", "::/EM:: ")
}

// em2 removes the "::EM::" / "::/EM::" markers that em1 inserted. For every
// marked span the markers are dropped; if the enclosed text starts with an
// uppercase letter it is wrapped in double quotes, otherwise it is returned
// unchanged.
//
// Markers without a matching partner in s are left untouched.
func em2(s string) string {
	return reEM.ReplaceAllStringFunc(s, func(span string) string {
		// reEM guarantees that span starts with the opening marker and ends
		// with the closing marker, so both trims always apply.
		inner := strings.TrimSuffix(strings.TrimPrefix(span, "::EM::"), "::/EM::")
		if reTitle.MatchString(inner) {
			return `"` + inner + `"`
		}
		return inner
	})
}