| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179 | 
							- package mahonia
 
- // decoding HTML entities
 
- import (
 
- 	"sort"
 
- )
 
- // EntityDecoder returns a Decoder that decodes HTML character entities.
 
- // If there is no valid character entity at the current position, it returns INVALID_CHAR.
 
- // So it needs to be combined with another Decoder via FallbackDecoder.
 
- func EntityDecoder() Decoder {
 
- 	var leftover rune // leftover rune from two-rune entity
 
- 	return func(p []byte) (r rune, size int, status Status) {
 
- 		if leftover != 0 {
 
- 			r = leftover
 
- 			leftover = 0
 
- 			return r, 0, SUCCESS
 
- 		}
 
- 		if len(p) == 0 {
 
- 			return 0, 0, NO_ROOM
 
- 		}
 
- 		if p[0] != '&' {
 
- 			return 0xfffd, 1, INVALID_CHAR
 
- 		}
 
- 		if len(p) < 3 {
 
- 			return 0, 1, NO_ROOM
 
- 		}
 
- 		r, size, status = 0xfffd, 1, INVALID_CHAR
 
- 		n := 1 // number of bytes read so far
 
- 		if p[n] == '#' {
 
- 			n++
 
- 			c := p[n]
 
- 			hex := false
 
- 			if c == 'x' || c == 'X' {
 
- 				hex = true
 
- 				n++
 
- 			}
 
- 			var x rune
 
- 			for n < len(p) {
 
- 				c = p[n]
 
- 				n++
 
- 				if hex {
 
- 					if '0' <= c && c <= '9' {
 
- 						x = 16*x + rune(c) - '0'
 
- 						continue
 
- 					} else if 'a' <= c && c <= 'f' {
 
- 						x = 16*x + rune(c) - 'a' + 10
 
- 						continue
 
- 					} else if 'A' <= c && c <= 'F' {
 
- 						x = 16*x + rune(c) - 'A' + 10
 
- 						continue
 
- 					}
 
- 				} else if '0' <= c && c <= '9' {
 
- 					x = 10*x + rune(c) - '0'
 
- 					continue
 
- 				}
 
- 				if c != ';' {
 
- 					n--
 
- 				}
 
- 				break
 
- 			}
 
- 			if n == len(p) && p[n-1] != ';' {
 
- 				return 0, 0, NO_ROOM
 
- 			}
 
- 			size = n
 
- 			if p[n-1] == ';' {
 
- 				n--
 
- 			}
 
- 			if hex {
 
- 				n--
 
- 			}
 
- 			n--
 
- 			// Now n is the number of actual digits read.
 
- 			if n == 0 {
 
- 				return 0xfffd, 1, INVALID_CHAR
 
- 			}
 
- 			if 0x80 <= x && x <= 0x9F {
 
- 				// Replace characters from Windows-1252 with UTF-8 equivalents.
 
- 				x = replacementTable[x-0x80]
 
- 			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 
- 				// Replace invalid characters with the replacement character.
 
- 				return 0xfffd, size, INVALID_CHAR
 
- 			}
 
- 			r = x
 
- 			status = SUCCESS
 
- 			return
 
- 		}
 
- 		// Look for a named entity in EntityList.
 
- 		possible := entityList
 
- 		for len(possible) > 0 {
 
- 			if len(p) <= n {
 
- 				leftover = 0
 
- 				return 0, 0, NO_ROOM
 
- 			}
 
- 			c := p[n]
 
- 			// Narrow down the selection in possible to those items that have c in the
 
- 			// appropriate byte.
 
- 			first := sort.Search(len(possible), func(i int) bool {
 
- 				e := possible[i].name
 
- 				if len(e) < n {
 
- 					return false
 
- 				}
 
- 				return e[n-1] >= c
 
- 			})
 
- 			possible = possible[first:]
 
- 			last := sort.Search(len(possible), func(i int) bool {
 
- 				return possible[i].name[n-1] > c
 
- 			})
 
- 			possible = possible[:last]
 
- 			n++
 
- 			if len(possible) > 0 && len(possible[0].name) == n-1 {
 
- 				r, leftover = possible[0].r1, possible[0].r2
 
- 				size = n
 
- 				status = SUCCESS
 
- 				// but don't return yet, since we need the longest match
 
- 			}
 
- 		}
 
- 		return
 
- 	}
 
- }
 
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they are replaced with. It is indexed by the reference value minus
// 0x80.
//
// This table is copied from /src/pkg/html/escape.go in the Go source.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two related cases are not part of the table: 0x00->'\uFFFD' is handled
// programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81 (maps to itself)
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D (maps to itself)
	0x017D, // 0x8E
	0x008F, // 0x8F (maps to itself)
	0x0090, // 0x90 (maps to itself)
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D (maps to itself)
	0x017E, // 0x9E
	0x0178, // 0x9F
}
 
 
  |