| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535 | 
							- // Copyright (c) 2014, David Kitchen <david@buro9.com>
 
- //
 
- // All rights reserved.
 
- //
 
- // Redistribution and use in source and binary forms, with or without
 
- // modification, are permitted provided that the following conditions are met:
 
- //
 
- // * Redistributions of source code must retain the above copyright notice, this
 
- //   list of conditions and the following disclaimer.
 
- //
 
- // * Redistributions in binary form must reproduce the above copyright notice,
 
- //   this list of conditions and the following disclaimer in the documentation
 
- //   and/or other materials provided with the distribution.
 
- //
 
- // * Neither the name of the organisation (Microcosm) nor the names of its
 
- //   contributors may be used to endorse or promote products derived from
 
- //   this software without specific prior written permission.
 
- //
 
- // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 
- // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 
- // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 
- // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 
- // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 
- // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 
- // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 
- // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 
- // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 
- // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
- package bluemonday
 
- import (
 
- 	"bytes"
 
- 	"io"
 
- 	"net/url"
 
- 	"strings"
 
- 	"golang.org/x/net/html"
 
- )
 
- // Sanitize takes a string that contains a HTML fragment or document and applies
 
- // the given policy whitelist.
 
- //
 
- // It returns a HTML string that has been sanitized by the policy or an empty
 
- // string if an error has occurred (most likely as a consequence of extremely
 
- // malformed input)
 
- func (p *Policy) Sanitize(s string) string {
 
- 	if strings.TrimSpace(s) == "" {
 
- 		return s
 
- 	}
 
- 	return p.sanitize(strings.NewReader(s)).String()
 
- }
 
- // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
 
- // the given policy whitelist.
 
- //
 
- // It returns a []byte containing the HTML that has been sanitized by the policy
 
- // or an empty []byte if an error has occurred (most likely as a consequence of
 
- // extremely malformed input)
 
- func (p *Policy) SanitizeBytes(b []byte) []byte {
 
- 	if len(bytes.TrimSpace(b)) == 0 {
 
- 		return b
 
- 	}
 
- 	return p.sanitize(bytes.NewReader(b)).Bytes()
 
- }
 
- // SanitizeReader takes an io.Reader that contains a HTML fragment or document
 
- // and applies the given policy whitelist.
 
- //
 
- // It returns a bytes.Buffer containing the HTML that has been sanitized by the
 
- // policy. Errors during sanitization will merely return an empty result.
 
- func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
 
- 	return p.sanitize(r)
 
- }
 
// sanitize performs the actual sanitization process: it tokenizes the HTML
// read from r and writes only policy-approved tokens into the returned buffer.
//
// io.EOF from the tokenizer signals successful completion; any other tokenizer
// error, or an unknown token type, yields an empty buffer.
func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	var (
		// buff accumulates the sanitized output.
		buff bytes.Buffer
		// skipElementContent is true while we are inside an element whose
		// entire content must be dropped (e.g. <script>…</script>).
		skipElementContent bool
		// skippingElementsCount tracks nesting depth of skip-content elements
		// so skipElementContent is only cleared at the outermost close tag.
		skippingElementsCount int64
		// skipClosingTag is true while closingTagToSkipStack is non-empty:
		// start tags that were dropped (all attributes stripped, element not
		// allowed bare) push their name here so the matching end tag is also
		// dropped.
		skipClosingTag        bool
		closingTagToSkipStack []string
		// mostRecentlyStartedToken remembers the last start tag so text inside
		// an allowed <script>/<style> is not HTML-escaped (see TextToken).
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return &buff
			}

			// Raw tokenizer error
			return &bytes.Buffer{}
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:
			// DOCTYPE survives only when the policy explicitly allows it.
			if p.allowDocType {
				buff.WriteString(token.String())
			}

		case html.CommentToken:
			// Comments are ignored by default

		case html.StartTagToken:
			mostRecentlyStartedToken = token.Data

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				// Element is not whitelisted. If it is a skip-content element
				// (e.g. script/style), suppress everything until its matching
				// close tag.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skipElementContent = true
					skippingElementsCount++
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					// All attributes were stripped and this element is not
					// permitted bare: drop the tag and remember to drop its
					// closing tag too.
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.EndTagToken:
			// Closing tag of a start tag we dropped above? Drop it too.
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				// Non-whitelisted element closing: unwind any skip-content
				// nesting it opened.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.SelfClosingTagToken:
			// Same whitelist and attribute handling as a start tag, but there
			// is no closing-tag bookkeeping to do.
			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					buff.WriteString(" ")
				}
				break
			}

			if !skipElementContent {
				buff.WriteString(token.String())
			}

		case html.TextToken:
			if !skipElementContent {
				switch strings.ToLower(mostRecentlyStartedToken) {
				case "javascript":
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					buff.WriteString(token.Data)
				default:
					// HTML escape the text
					buff.WriteString(token.String())
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return &bytes.Buffer{}
		}
	}
}
 
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies.
//
// For linkable elements it additionally enforces URL parseability (when the
// policy requires it) and injects rel="nofollow", target="_blank" and
// rel="noopener" according to the policy's link options.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
) []html.Attribute {
	if len(attrs) == 0 {
		return attrs
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
		// Is there an element specific attribute policy that applies?
		if ap, ok := aps[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				// Regexp-constrained policy: the value must match to survive.
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue
				}
			} else {
				// Unconstrained policy: any value is accepted.
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there a global attribute policy that applies?
		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
			if ap.regexp != nil {
				if ap.regexp.MatchString(htmlAttr.Val) {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "link":
					if htmlAttr.Key == "href" {
						// validURL may normalize the value; an invalid URL
						// drops the attribute entirely.
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "img", "script":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					// Non-URL-bearing attribute on a linkable element.
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						// A non-empty host means the link is fully qualified
						// rather than relative.
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && addNoFollow {
							// Reuse an existing rel attribute, appending
							// "nofollow" only when it is not already present.
							if strings.Contains(htmlAttr.Val, "nofollow") {
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							} else {
								htmlAttr.Val += " nofollow"
								noFollowFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							// Overwrite any other target value when the policy
							// demands _blank on external links.
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if addNoFollow && !noFollowFound {
						// No rel attribute existed at all; create one.
						rel := html.Attribute{}
						rel.Key = "rel"
						rel.Val = "nofollow"
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						// No target attribute existed at all; create one.
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}
								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}
					}
				}
			default:
			}
		}
	}

	return cleanAttrs
}
 
- func (p *Policy) allowNoAttrs(elementName string) bool {
 
- 	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
 
- 	return ok
 
- }
 
- func (p *Policy) validURL(rawurl string) (string, bool) {
 
- 	if p.requireParseableURLs {
 
- 		// URLs do not contain whitespace
 
- 		if strings.Contains(rawurl, " ") ||
 
- 			strings.Contains(rawurl, "\t") ||
 
- 			strings.Contains(rawurl, "\n") {
 
- 			return "", false
 
- 		}
 
- 		u, err := url.Parse(rawurl)
 
- 		if err != nil {
 
- 			return "", false
 
- 		}
 
- 		if u.Scheme != "" {
 
- 			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
 
- 			if !ok {
 
- 				return "", false
 
- 			}
 
- 			if urlPolicy == nil || urlPolicy(u) == true {
 
- 				return u.String(), true
 
- 			}
 
- 			return "", false
 
- 		}
 
- 		if p.allowRelativeURLs {
 
- 			if u.String() != "" {
 
- 				return u.String(), true
 
- 			}
 
- 		}
 
- 		return "", false
 
- 	}
 
- 	return rawurl, true
 
- }
 
// linkable reports whether elementName carries an attribute that references a
// URL (a.href, area.href, link.href, blockquote.cite, q.cite, img.src,
// script.src) and therefore needs the URL handling in sanitizeAttrs.
//
// Fix: "q" was missing even though sanitizeAttrs validates q.cite, so
// <q cite="..."> previously bypassed URL policy enforcement entirely.
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "blockquote", "img", "link", "q", "script":
		return true
	default:
		return false
	}
}
 
 
  |