package html2text import ( "bytes" "regexp" "strconv" "strings" ) // Line break constants // Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) const ( WIN_LBR = "\r\n" UNIX_LBR = "\n" ) var legacyLBR = WIN_LBR var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`) var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`) var badLinkHrefRE = regexp.MustCompile(`javascript:`) var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`) var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`) type options struct { lbr string linksInnerText bool listPrefix string } func newOptions() *options { // apply defaults return &options{ lbr: WIN_LBR, } } // Option is a functional option type Option func(*options) // WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default) func WithUnixLineBreaks() Option { return func(o *options) { o.lbr = UNIX_LBR } } // WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text // Example: click news <http://bit.ly/2n4wXRs> func WithLinksInnerText() Option { return func(o *options) { o.linksInnerText = true } } // WithListSupportPrefix formats <ul> and <li> lists with the specified prefix func WithListSupportPrefix(prefix string) Option { return func(o *options) { o.listPrefix = prefix } } // WithListSupport formats <ul> and <li> lists with " - " prefix func WithListSupport() Option { return WithListSupportPrefix(" - ") } func parseHTMLEntity(entName string) (string, bool) { if r, ok := entity[entName]; ok { return string(r), true } if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 { var ( err error n int64 digits = match[1] ) if digits != "" && (digits[0] == 'x' || digits[0] == 'X') { n, err = strconv.ParseInt(digits[1:], 16, 64) } else { n, err = strconv.ParseInt(digits, 10, 64) } if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) { return string(rune(n)), true } } return "", false } // SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n") // with argument false sets Windows-style line-breaks in output ("\r\n", the default) // Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) func SetUnixLbr(b bool) { if b { legacyLBR = UNIX_LBR } else { legacyLBR = WIN_LBR } } // HTMLEntitiesToText decodes HTML entities inside a provided // string and returns decoded text func HTMLEntitiesToText(htmlEntsText string) string { outBuf := bytes.NewBufferString("") inEnt := false for i, r := range htmlEntsText { switch { case r == ';' && inEnt: inEnt = false continue case r == '&': //possible html entity entName := "" isEnt := false // parse the entity name - max 10 chars chars := 0 for _, er := range htmlEntsText[i+1:] { if er == ';' { isEnt = true break } else { entName += string(er) } chars++ if chars == 10 { break } } if isEnt { if ent, isEnt := parseHTMLEntity(entName); isEnt { outBuf.WriteString(ent) inEnt = true continue } } } if !inEnt { outBuf.WriteRune(r) } } return outBuf.String() } func writeSpace(outBuf *bytes.Buffer) { bts := outBuf.Bytes() if len(bts) > 0 && bts[len(bts)-1] != ' ' { outBuf.WriteString(" ") } } // HTML2Text converts html into a text form func HTML2Text(html string) string { var opts []Option if legacyLBR == UNIX_LBR { opts = append(opts, WithUnixLineBreaks()) } return HTML2TextWithOptions(html, opts...) } // HTML2TextWithOptions converts html into a text form with additional options func HTML2TextWithOptions(html string, reqOpts ...Option) string { opts := newOptions() for _, opt := range reqOpts { opt(opts) } inLen := len(html) tagStart := 0 inEnt := false badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head> shouldOutput := true // maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only) hrefs := []string{} // new line cannot be printed at the beginning or // for <p> after a new line created by previous <p></p> canPrintNewline := false outBuf := bytes.NewBufferString("") for i, r := range html { if inLen > 0 && i == inLen-1 { // prevent new line at the end of the document canPrintNewline = false } switch { // skip new lines and spaces adding a single space if not there yet case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines r == ' ', r >= 0x2008 && r <= 0x200B: // spaces if shouldOutput && badTagStackDepth == 0 && !inEnt { //outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i])) writeSpace(outBuf) } continue case r == ';' && inEnt: // end of html entity inEnt = false continue case r == '&' && shouldOutput: // possible html entity entName := "" isEnt := false // parse the entity name - max 10 chars chars := 0 for _, er := range html[i+1:] { if er == ';' { isEnt = true break } else { entName += string(er) } chars++ if chars == 10 { break } } if isEnt { if ent, isEnt := parseHTMLEntity(entName); isEnt { outBuf.WriteString(ent) inEnt = true continue } } case r == '<': // start of a tag tagStart = i + 1 shouldOutput = false continue case r == '>': // end of a tag shouldOutput = true tag := html[tagStart:i] tagNameLowercase := strings.ToLower(tag) if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" { outBuf.WriteString(opts.lbr) } else if tagNameLowercase == "li" || tagNameLowercase == "li/" { if opts.listPrefix != "" { outBuf.WriteString(opts.lbr + opts.listPrefix) } else { outBuf.WriteString(opts.lbr) } } else if headersRE.MatchString(tagNameLowercase) { if canPrintNewline { outBuf.WriteString(opts.lbr + opts.lbr) } canPrintNewline = false } else if tagNameLowercase == "br" || tagNameLowercase == "br/" { // new line outBuf.WriteString(opts.lbr) } else if tagNameLowercase == "p" || tagNameLowercase == "/p" { if canPrintNewline { outBuf.WriteString(opts.lbr + opts.lbr) } canPrintNewline = false } else if opts.linksInnerText && tagNameLowercase == "/a" { // end of link // links can be empty can happen if the link matches the badLinkHrefRE if len(hrefs) > 0 { outBuf.WriteString(" <") outBuf.WriteString(HTMLEntitiesToText(hrefs[0])) outBuf.WriteString(">") hrefs = hrefs[1:] } } else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) { // parse link href // add special handling for a tags m := linkTagRE.FindStringSubmatch(tag) if len(m) == 5 { link := m[2] if len(link) == 0 { link = m[3] if len(link) == 0 { link = m[4] } } if opts.linksInnerText && !badLinkHrefRE.MatchString(link) { hrefs = append(hrefs, link) } } } else if badTagnamesRE.MatchString(tagNameLowercase) { // unwanted block badTagStackDepth++ // if link inner text preservation is not enabled // and the current tag is a link tag, parse its href and output that if !opts.linksInnerText { // parse link href m := linkTagRE.FindStringSubmatch(tag) if len(m) == 5 { link := m[2] if len(link) == 0 { link = m[3] if len(link) == 0 { link = m[4] } } if !badLinkHrefRE.MatchString(link) { outBuf.WriteString(HTMLEntitiesToText(link)) } } } } else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' && badTagnamesRE.MatchString(tagNameLowercase[1:]) { // end of unwanted block badTagStackDepth-- } continue } // switch end if shouldOutput && badTagStackDepth == 0 && !inEnt { canPrintNewline = true outBuf.WriteRune(r) } } return outBuf.String() }