// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bytes implements functions for the manipulation of byte slices.
// It is analogous to the facilities of the strings package.
package bytes

import (
	"internal/bytealg"
	"unicode"
	"unicode/utf8"
)

// Equal reports whether a and b
// are the same length and contain the same bytes.
// A nil argument is equivalent to an empty slice.
func Equal(a, b []byte) bool {
	// Neither cmd/compile nor gccgo allocates for these string conversions.
	return string(a) == string(b)
}

// Compare returns an integer comparing two byte slices lexicographically.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
// A nil argument is equivalent to an empty slice.
func Compare(a, b []byte) int {
	return bytealg.Compare(a, b)
}

// explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
func explode(s []byte, n int) [][]byte {
	if n <= 0 {
		n = len(s)
	}
	a := make([][]byte, n)
	var size int
	na := 0
	for len(s) > 0 {
		if na+1 >= n {
			a[na] = s
			na++
			break
		}
		_, size = utf8.DecodeRune(s)
		a[na] = s[0:size:size]
		s = s[size:]
		na++
	}
	return a[0:na]
}

// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
func Count(s, sep []byte) int {
	// special case
	if len(sep) == 0 {
		return utf8.RuneCount(s) + 1
	}
	if len(sep) == 1 {
		return bytealg.Count(s, sep[0])
	}
	n := 0
	for {
		i := Index(s, sep)
		if i == -1 {
			return n
		}
		n++
		s = s[i+len(sep):]
	}
}

// Contains reports whether subslice is within b.
func Contains(b, subslice []byte) bool {
	return Index(b, subslice) != -1
}

// ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
func ContainsAny(b []byte, chars string) bool {
	return IndexAny(b, chars) >= 0
}

// ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
func ContainsRune(b []byte, r rune) bool {
	return IndexRune(b, r) >= 0
}

// IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
func IndexByte(b []byte, c byte) int {
	return bytealg.IndexByte(b, c)
}

func indexBytePortable(s []byte, c byte) int {
	for i, b := range s {
		if b == c {
			return i
		}
	}
	return -1
}

// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
func LastIndex(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return len(s)
	case n == 1:
		return LastIndexByte(s, sep[0])
	case n == len(s):
		if Equal(s, sep) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	}
	// Rabin-Karp search from the end of the string
	hashss, pow := bytealg.HashStrRevBytes(sep)
	last := len(s) - n
	var h uint32
	for i := len(s) - 1; i >= last; i-- {
		h = h*bytealg.PrimeRK + uint32(s[i])
	}
	if h == hashss && Equal(s[last:], sep) {
		return last
	}
	for i := last - 1; i >= 0; i-- {
		h *= bytealg.PrimeRK
		h += uint32(s[i])
		h -= pow * uint32(s[i+n])
		if h == hashss && Equal(s[i:i+n], sep) {
			return i
		}
	}
	return -1
}

// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
func LastIndexByte(s []byte, c byte) int {
	for i := len(s) - 1; i >= 0; i-- {
		if s[i] == c {
			return i
		}
	}
	return -1
}

// IndexRune interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index of the first occurrence in s of the given rune.
// It returns -1 if rune is not present in s.
// If r is utf8.RuneError, it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s []byte, r rune) int {
	switch {
	case 0 <= r && r < utf8.RuneSelf:
		return IndexByte(s, byte(r))
	case r == utf8.RuneError:
		for i := 0; i < len(s); {
			r1, n := utf8.DecodeRune(s[i:])
			if r1 == utf8.RuneError {
				return i
			}
			i += n
		}
		return -1
	case !utf8.ValidRune(r):
		return -1
	default:
		var b [utf8.UTFMax]byte
		n := utf8.EncodeRune(b[:], r)
		return Index(s, b[:n])
	}
}

// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first occurrence in s of any of the Unicode
// code points in chars. It returns -1 if chars is empty or if there is no code
// point in common.
func IndexAny(s []byte, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) == 1 {
		r := rune(s[0])
		if r >= utf8.RuneSelf {
			// search utf8.RuneError.
			for _, r = range chars {
				if r == utf8.RuneError {
					return 0
				}
			}
			return -1
		}
		if bytealg.IndexByteString(chars, s[0]) >= 0 {
			return 0
		}
		return -1
	}
	if len(chars) == 1 {
		r := rune(chars[0])
		if r >= utf8.RuneSelf {
			r = utf8.RuneError
		}
		return IndexRune(s, r)
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i, c := range s {
				if as.contains(c) {
					return i
				}
			}
			return -1
		}
	}
	var width int
	for i := 0; i < len(s); i += width {
		r := rune(s[i])
		if r < utf8.RuneSelf {
			if bytealg.IndexByteString(chars, s[i]) >= 0 {
				return i
			}
			width = 1
			continue
		}
		r, width = utf8.DecodeRune(s[i:])
		if r != utf8.RuneError {
			// r is 2 to 4 bytes
			if len(chars) == width {
				if chars == string(r) {
					return i
				}
				continue
			}
			// Use bytealg.IndexString for performance if available.
			if bytealg.MaxLen >= width {
				if bytealg.IndexString(chars, string(r)) >= 0 {
					return i
				}
				continue
			}
		}
		for _, ch := range chars {
			if r == ch {
				return i
			}
		}
	}
	return -1
}

// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
// points. It returns the byte index of the last occurrence in s of any of
// the Unicode code points in chars. It returns -1 if chars is empty or if
// there is no code point in common.
func LastIndexAny(s []byte, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := len(s) - 1; i >= 0; i-- {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	if len(s) == 1 {
		r := rune(s[0])
		if r >= utf8.RuneSelf {
			for _, r = range chars {
				if r == utf8.RuneError {
					return 0
				}
			}
			return -1
		}
		if bytealg.IndexByteString(chars, s[0]) >= 0 {
			return 0
		}
		return -1
	}
	if len(chars) == 1 {
		cr := rune(chars[0])
		if cr >= utf8.RuneSelf {
			cr = utf8.RuneError
		}
		for i := len(s); i > 0; {
			r, size := utf8.DecodeLastRune(s[:i])
			i -= size
			if r == cr {
				return i
			}
		}
		return -1
	}
	for i := len(s); i > 0; {
		r := rune(s[i-1])
		if r < utf8.RuneSelf {
			if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
				return i - 1
			}
			i--
			continue
		}
		r, size := utf8.DecodeLastRune(s[:i])
		i -= size
		if r != utf8.RuneError {
			// r is 2 to 4 bytes
			if len(chars) == size {
				if chars == string(r) {
					return i
				}
				continue
			}
			// Use bytealg.IndexString for performance if available.
			if bytealg.MaxLen >= size {
				if bytealg.IndexString(chars, string(r)) >= 0 {
					return i
				}
				continue
			}
		}
		for _, ch := range chars {
			if r == ch {
				return i
			}
		}
	}
	return -1
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subslices.
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
	if n == 0 {
		return nil
	}
	if len(sep) == 0 {
		return explode(s, n)
	}
	if n < 0 {
		n = Count(s, sep) + 1
	}

	a := make([][]byte, n)
	n--
	i := 0
	for i < n {
		m := Index(s, sep)
		if m < 0 {
			break
		}
		a[i] = s[: m+sepSave : m+sepSave]
		s = s[m+len(sep):]
		i++
	}
	a[i] = s
	return a[:i+1]
}

// SplitN slices s into subslices separated by sep and returns a slice of
// the subslices between those separators.
// If sep is empty, SplitN splits after each UTF-8 sequence.
// The count determines the number of subslices to return:
//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
//   n == 0: the result is nil (zero subslices)
//   n < 0: all subslices
func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }

// SplitAfterN slices s into subslices after each instance of sep and
// returns a slice of those subslices.
// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
// The count determines the number of subslices to return:
//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
//   n == 0: the result is nil (zero subslices)
//   n < 0: all subslices
func SplitAfterN(s, sep []byte, n int) [][]byte {
	return genSplit(s, sep, len(sep), n)
}

// Split slices s into all subslices separated by sep and returns a slice of
// the subslices between those separators.
// If sep is empty, Split splits after each UTF-8 sequence.
// It is equivalent to SplitN with a count of -1.
func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }

// SplitAfter slices s into all subslices after each instance of sep and
// returns a slice of those subslices.
// If sep is empty, SplitAfter splits after each UTF-8 sequence.
// It is equivalent to SplitAfterN with a count of -1.
func SplitAfter(s, sep []byte) [][]byte {
	return genSplit(s, sep, len(sep), -1)
}

var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
// empty slice if s contains only white space.
func Fields(s []byte) [][]byte {
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits >= utf8.RuneSelf {
		// Some runes in the input slice are not ASCII.
		return FieldsFunc(s, unicode.IsSpace)
	}

	// ASCII fast path
	a := make([][]byte, n)
	na := 0
	fieldStart := 0
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) && asciiSpace[s[i]] != 0 {
		i++
	}
	fieldStart = i
	for i < len(s) {
		if asciiSpace[s[i]] == 0 {
			i++
			continue
		}
		a[na] = s[fieldStart:i:i]
		na++
		i++
		// Skip spaces in between fields.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a[na] = s[fieldStart:len(s):len(s)]
	}
	return a
}

// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
// len(s) == 0, an empty slice is returned.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
	// A span is used to record a slice of s of the form s[start:end].
	// The start index is inclusive and the end index is exclusive.
	type span struct {
		start int
		end   int
	}
	spans := make([]span, 0, 32)

	// Find the field start and end indices.
	// Doing this in a separate pass (rather than slicing the string s
	// and collecting the result substrings right away) is significantly
	// more efficient, possibly due to cache effects.
	start := -1 // valid span start if >= 0
	for i := 0; i < len(s); {
		size := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(s[i:])
		}
		if f(r) {
			if start >= 0 {
				spans = append(spans, span{start, i})
				start = -1
			}
		} else {
			if start < 0 {
				start = i
			}
		}
		i += size
	}

	// Last field might end at EOF.
	if start >= 0 {
		spans = append(spans, span{start, len(s)})
	}

	// Create subslices from recorded field indices.
	a := make([][]byte, len(spans))
	for i, span := range spans {
		a[i] = s[span.start:span.end:span.end]
	}

	return a
}

// Join concatenates the elements of s to create a new byte slice. The separator
// sep is placed between elements in the resulting slice.
func Join(s [][]byte, sep []byte) []byte {
	if len(s) == 0 {
		return []byte{}
	}
	if len(s) == 1 {
		// Just return a copy.
		return append([]byte(nil), s[0]...)
	}
	n := len(sep) * (len(s) - 1)
	for _, v := range s {
		n += len(v)
	}

	b := make([]byte, n)
	bp := copy(b, s[0])
	for _, v := range s[1:] {
		bp += copy(b[bp:], sep)
		bp += copy(b[bp:], v)
	}
	return b
}

// HasPrefix tests whether the byte slice s begins with prefix.
func HasPrefix(s, prefix []byte) bool {
	return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
}

// HasSuffix tests whether the byte slice s ends with suffix.
func HasSuffix(s, suffix []byte) bool {
	return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
}

// Map returns a copy of the byte slice s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the byte slice with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded code points.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// In the worst case, the slice can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}
		r = mapping(r)
		if r >= 0 {
			rl := utf8.RuneLen(r)
			if rl < 0 {
				rl = len(string(utf8.RuneError))
			}
			if nbytes+rl > maxbytes {
				// Grow the buffer.
				maxbytes = maxbytes*2 + utf8.UTFMax
				nb := make([]byte, maxbytes)
				copy(nb, b[0:nbytes])
				b = nb
			}
			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
		}
		i += wid
	}
	return b[0:nbytes]
}

// Repeat returns a new byte slice consisting of count copies of b.
//
// It panics if count is negative or if
// the result of (len(b) * count) overflows.
func Repeat(b []byte, count int) []byte {
	if count == 0 {
		return []byte{}
	}
	// Since we cannot return an error on overflow,
	// we should panic if the repeat will generate
	// an overflow.
	// See Issue golang.org/issue/16237.
	if count < 0 {
		panic("bytes: negative Repeat count")
	} else if len(b)*count/count != len(b) {
		panic("bytes: Repeat count causes overflow")
	}

	nb := make([]byte, len(b)*count)
	bp := copy(nb, b)
	for bp < len(nb) {
		copy(nb[bp:], nb[:bp])
		bp *= 2
	}
	return nb
}

// ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
// their upper case.
func ToUpper(s []byte) []byte {
	isASCII, hasLower := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasLower = hasLower || ('a' <= c && c <= 'z')
	}

	if isASCII { // optimize for ASCII-only byte slices.
		if !hasLower {
			// Just return a copy.
			return append([]byte(""), s...)
		}
		b := make([]byte, len(s))
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A'
			}
			b[i] = c
		}
		return b
	}
	return Map(unicode.ToUpper, s)
}

// ToLower returns a copy of the byte slice s with all Unicode letters mapped to
// their lower case.
func ToLower(s []byte) []byte {
	isASCII, hasUpper := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasUpper = hasUpper || ('A' <= c && c <= 'Z')
	}

	if isASCII { // optimize for ASCII-only byte slices.
		if !hasUpper {
			return append([]byte(""), s...)
		}
		b := make([]byte, len(s))
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			b[i] = c
		}
		return b
	}
	return Map(unicode.ToLower, s)
}

// ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }

// ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// upper case, giving priority to the special casing rules.
func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToUpper, s)
}

// ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// lower case, giving priority to the special casing rules.
func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToLower, s)
}

// ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// title case, giving priority to the special casing rules.
func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToTitle, s)
}

// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
func ToValidUTF8(s, replacement []byte) []byte {
	b := make([]byte, 0, len(s)+len(replacement))
	invalid := false // previous byte was from an invalid UTF-8 sequence
	for i := 0; i < len(s); {
		c := s[i]
		if c < utf8.RuneSelf {
			i++
			invalid = false
			b = append(b, byte(c))
			continue
		}
		_, wid := utf8.DecodeRune(s[i:])
		if wid == 1 {
			i++
			if !invalid {
				invalid = true
				b = append(b, replacement...)
			}
			continue
		}
		invalid = false
		b = append(b, s[i:i+wid]...)
		i += wid
	}
	return b
}

// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
	// ASCII alphanumerics and underscore are not separators
	if r <= 0x7F {
		switch {
		case '0' <= r && r <= '9':
			return false
		case 'a' <= r && r <= 'z':
			return false
		case 'A' <= r && r <= 'Z':
			return false
		case r == '_':
			return false
		}
		return true
	}
	// Letters and digits are not separators
	if unicode.IsLetter(r) || unicode.IsDigit(r) {
		return false
	}
	// Otherwise, all we can do for now is treat spaces as separators.
	return unicode.IsSpace(r)
}

// Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
// words mapped to their title case.
//
// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
func Title(s []byte) []byte {
	// Use a closure here to remember state.
	// Hackish but effective. Depends on Map scanning in order and calling
	// the closure once per rune.
	prev := ' '
	return Map(
		func(r rune) rune {
			if isSeparator(prev) {
				prev = r
				return unicode.ToTitle(r)
			}
			prev = r
			return r
		},
		s)
}

// TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
// all leading UTF-8-encoded code points c that satisfy f(c).
func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
	i := indexFunc(s, f, false)
	if i == -1 {
		return nil
	}
	return s[i:]
}

// TrimRightFunc returns a subslice of s by slicing off all trailing
// UTF-8-encoded code points c that satisfy f(c).
func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
	i := lastIndexFunc(s, f, false)
	if i >= 0 && s[i] >= utf8.RuneSelf {
		_, wid := utf8.DecodeRune(s[i:])
		i += wid
	} else {
		i++
	}
	return s[0:i]
}

// TrimFunc returns a subslice of s by slicing off all leading and trailing
// UTF-8-encoded code points c that satisfy f(c).
func TrimFunc(s []byte, f func(r rune) bool) []byte {
	return TrimRightFunc(TrimLeftFunc(s, f), f)
}

// TrimPrefix returns s without the provided leading prefix string.
// If s doesn't start with prefix, s is returned unchanged.
func TrimPrefix(s, prefix []byte) []byte {
	if HasPrefix(s, prefix) {
		return s[len(prefix):]
	}
	return s
}

// TrimSuffix returns s without the provided trailing suffix string.
// If s doesn't end with suffix, s is returned unchanged.
func TrimSuffix(s, suffix []byte) []byte {
	if HasSuffix(s, suffix) {
		return s[:len(s)-len(suffix)]
	}
	return s
}

// IndexFunc interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index in s of the first Unicode
// code point satisfying f(c), or -1 if none do.
func IndexFunc(s []byte, f func(r rune) bool) int {
	return indexFunc(s, f, true)
}

// LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index in s of the last Unicode
// code point satisfying f(c), or -1 if none do.
func LastIndexFunc(s []byte, f func(r rune) bool) int {
	return lastIndexFunc(s, f, true)
}

// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
	start := 0
	for start < len(s) {
		wid := 1
		r := rune(s[start])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[start:])
		}
		if f(r) == truth {
			return start
		}
		start += wid
	}
	return -1
}

// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
	for i := len(s); i > 0; {
		r, size := rune(s[i-1]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeLastRune(s[0:i])
		}
		i -= size
		if f(r) == truth {
			return i
		}
	}
	return -1
}

// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		as[c>>5] |= 1 << uint(c&31)
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	return (as[c>>5] & (1 << uint(c&31))) != 0
}

func makeCutsetFunc(cutset string) func(r rune) bool {
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return func(r rune) bool {
			return r == rune(cutset[0])
		}
	}
	if as, isASCII := makeASCIISet(cutset); isASCII {
		return func(r rune) bool {
			return r < utf8.RuneSelf && as.contains(byte(r))
		}
	}
	return func(r rune) bool {
		for _, c := range cutset {
			if c == r {
				return true
			}
		}
		return false
	}
}

// Trim returns a subslice of s by slicing off all leading and
// trailing UTF-8-encoded code points contained in cutset.
func Trim(s []byte, cutset string) []byte {
	return TrimFunc(s, makeCutsetFunc(cutset))
}

// TrimLeft returns a subslice of s by slicing off all leading
// UTF-8-encoded code points contained in cutset.
func TrimLeft(s []byte, cutset string) []byte {
	return TrimLeftFunc(s, makeCutsetFunc(cutset))
}

// TrimRight returns a subslice of s by slicing off all trailing
// UTF-8-encoded code points that are contained in cutset.
func TrimRight(s []byte, cutset string) []byte {
	return TrimRightFunc(s, makeCutsetFunc(cutset))
}

// TrimSpace returns a subslice of s by slicing off all leading and
// trailing white space, as defined by Unicode.
func TrimSpace(s []byte) []byte {
	// Fast path for ASCII: look for the first ASCII non-space byte
	start := 0
	for ; start < len(s); start++ {
		c := s[start]
		if c >= utf8.RuneSelf {
			// If we run into a non-ASCII byte, fall back to the
			// slower unicode-aware method on the remaining bytes
			return TrimFunc(s[start:], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// Now look for the first ASCII non-space byte from the end
	stop := len(s)
	for ; stop > start; stop-- {
		c := s[stop-1]
		if c >= utf8.RuneSelf {
			return TrimFunc(s[start:stop], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// At this point s[start:stop] starts and ends with an ASCII
	// non-space bytes, so we're done. Non-ASCII cases have already
	// been handled above.
	if start == stop {
		// Special case to preserve previous TrimLeftFunc behavior,
		// returning nil instead of empty slice if all spaces.
		return nil
	}
	return s[start:stop]
}

// Runes interprets s as a sequence of UTF-8-encoded code points.
// It returns a slice of runes (Unicode code points) equivalent to s.
func Runes(s []byte) []rune {
	t := make([]rune, utf8.RuneCount(s))
	i := 0
	for len(s) > 0 {
		r, l := utf8.DecodeRune(s)
		t[i] = r
		i++
		s = s[l:]
	}
	return t
}

// Replace returns a copy of the slice s with the first n
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the slice
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune slice.
// If n < 0, there is no limit on the number of replacements.
func Replace(s, old, new []byte, n int) []byte {
	m := 0
	if n != 0 {
		// Compute number of replacements.
		m = Count(s, old)
	}
	if m == 0 {
		// Just return a copy.
		return append([]byte(nil), s...)
	}
	if n < 0 || m < n {
		n = m
	}

	// Apply replacements to buffer.
	t := make([]byte, len(s)+n*(len(new)-len(old)))
	w := 0
	start := 0
	for i := 0; i < n; i++ {
		j := start
		if len(old) == 0 {
			if i > 0 {
				_, wid := utf8.DecodeRune(s[start:])
				j += wid
			}
		} else {
			j += Index(s[start:], old)
		}
		w += copy(t[w:], s[start:j])
		w += copy(t[w:], new)
		start = j + len(old)
	}
	w += copy(t[w:], s[start:])
	return t[0:w]
}

// ReplaceAll returns a copy of the slice s with all
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the slice
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune slice.
func ReplaceAll(s, old, new []byte) []byte {
	return Replace(s, old, new, -1)
}

// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under Unicode case-folding, which is a more general
// form of case-insensitivity.
func EqualFold(s, t []byte) bool {
	for len(s) != 0 && len(t) != 0 {
		// Extract first rune from each.
		var sr, tr rune
		if s[0] < utf8.RuneSelf {
			sr, s = rune(s[0]), s[1:]
		} else {
			r, size := utf8.DecodeRune(s)
			sr, s = r, s[size:]
		}
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRune(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
		if tr < utf8.RuneSelf {
			// ASCII only, sr/tr must be upper/lower case
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case. SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

	// One string is empty. Are both?
	return len(s) == len(t)
}

// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= bytealg.MaxLen:
		// Use brute force when s and sep both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.Index(s, sep)
		}
		c0 := sep[0]
		c1 := sep[1]
		i := 0
		t := len(s) - n + 1
		fails := 0
		for i < t {
			if s[i] != c0 {
				// IndexByte is faster than bytealg.Index, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(s[i+1:], c0)
				if o < 0 {
					return -1
				}
				i += o + 1
			}
			if s[i+1] == c1 && Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to bytealg.Index when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.Index(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	c0 := sep[0]
	c1 := sep[1]
	i := 0
	fails := 0
	t := len(s) - n + 1
	for i < t {
		if s[i] != c0 {
			o := IndexByte(s[i+1:], c0)
			if o < 0 {
				break
			}
			i += o + 1
		}
		if s[i+1] == c1 && Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < t {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := bytealg.IndexRabinKarpBytes(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}