// Copyright 2009 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.
// Package strings implements simple functions to manipulate UTF-8 encoded strings.//// For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
package stringsimport ()// explode splits s into a slice of UTF-8 strings,// one string per Unicode character up to a maximum of n (n < 0 means no limit).// Invalid UTF-8 sequences become correct encodings of U+FFFD.func ( string, int) []string { := utf8.RuneCountInString()if < 0 || > { = } := make([]string, )for := 0; < -1; ++ { , := utf8.DecodeRuneInString() [] = [:] = [:]if == utf8.RuneError { [] = string(utf8.RuneError) } }if > 0 { [-1] = }return}// Count counts the number of non-overlapping instances of substr in s.// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.func (, string) int {// special caseiflen() == 0 {returnutf8.RuneCountInString() + 1 }iflen() == 1 {returnbytealg.CountString(, [0]) } := 0for { := Index(, )if == -1 {return } ++ = [+len():] }}// Contains reports whether substr is within s.func (, string) bool {returnIndex(, ) >= 0}// ContainsAny reports whether any Unicode code points in chars are within s.func (, string) bool {returnIndexAny(, ) >= 0}// ContainsRune reports whether the Unicode code point r is within s.func ( string, rune) bool {returnIndexRune(, ) >= 0}// LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s.func (, string) int { := len()switch {case == 0:returnlen()case == 1:returnLastIndexByte(, [0])case == len():if == {return0 }return -1case > len():return -1 }// Rabin-Karp search from the end of the string , := bytealg.HashStrRev() := len() - varuint32for := len() - 1; >= ; -- { = *bytealg.PrimeRK + uint32([]) }if == && [:] == {return }for := - 1; >= 0; -- { *= bytealg.PrimeRK += uint32([]) -= * uint32([+])if == && [:+] == {return } }return -1}// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.func ( string, byte) int {returnbytealg.IndexByteString(, )}// IndexRune returns the index of the first instance of the Unicode code point// r, or -1 if rune is not present in s.// If r is utf8.RuneError, it returns the first instance of any// invalid UTF-8 byte sequence.func ( string, rune) int {switch {case0 <= && < utf8.RuneSelf:returnIndexByte(, byte())case == utf8.RuneError:for , := range {if == utf8.RuneError {return } }return -1case !utf8.ValidRune():return -1default:returnIndex(, string()) }}// IndexAny returns the index of the first instance of any Unicode code point// from chars in s, or -1 if no Unicode code point from chars is present in s.func (, string) int {if == "" {// Avoid scanning all of s.return -1 }iflen() == 1 {// Avoid scanning all of s. := rune([0])if >= utf8.RuneSelf { = utf8.RuneError }returnIndexRune(, ) }iflen() > 8 {if , := makeASCIISet(); {for := 0; < len(); ++ {if .contains([]) {return } }return -1 } }for , := range {ifIndexRune(, ) >= 0 {return } }return -1}// LastIndexAny returns the index of the last instance of any Unicode code// point from chars in s, or -1 if no Unicode code point from chars is// present in s.func (, string) int {if == "" {// Avoid scanning all of s.return -1 }iflen() == 1 { := rune([0])if >= utf8.RuneSelf { = utf8.RuneError }ifIndexRune(, ) >= 0 {return0 }return -1 }iflen() > 8 {if , := makeASCIISet(); {for := len() - 1; >= 0; -- {if .contains([]) {return } }return -1 } }iflen() == 1 { := rune([0])if >= utf8.RuneSelf { = utf8.RuneError }for := len(); > 0; { , := utf8.DecodeLastRuneInString([:]) -= if == {return } }return -1 }for := len(); > 0; { , := utf8.DecodeLastRuneInString([:]) -= ifIndexRune(, ) >= 0 {return } }return -1}// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.func ( string, byte) int {for := len() - 1; >= 0; -- {if [] == {return } }return -1}// Generic split: splits after each instance of sep,// including sepSave bytes of sep in the subarrays.func (, string, , int) []string {if == 0 {returnnil }if == "" {returnexplode(, ) }if < 0 { = Count(, ) + 1 } := make([]string, ) -- := 0for < { := Index(, )if < 0 {break } [] = [:+] = [+len():] ++ } [] = return [:+1]}// SplitN slices s into substrings separated by sep and returns a slice of// the substrings between those separators.//// The count determines the number of substrings to return:// n > 0: at most n substrings; the last substring will be the unsplit remainder.// n == 0: the result is nil (zero substrings)// n < 0: all substrings//// Edge cases for s and sep (for example, empty strings) are handled// as described in the documentation for Split.func (, string, int) []string { returngenSplit(, , 0, ) }// SplitAfterN slices s into substrings after each instance of sep and// returns a slice of those substrings.//// The count determines the number of substrings to return:// n > 0: at most n substrings; the last substring will be the unsplit remainder.// n == 0: the result is nil (zero substrings)// n < 0: all substrings//// Edge cases for s and sep (for example, empty strings) are handled// as described in the documentation for SplitAfter.func (, string, int) []string {returngenSplit(, , len(), )}// Split slices s into all substrings separated by sep and returns a slice of// the substrings between those separators.//// If s does not contain sep and sep is not empty, Split returns a// slice of length 1 whose only element is s.//// If sep is empty, Split splits after each UTF-8 sequence. If both s// and sep are empty, Split returns an empty slice.//// It is equivalent to SplitN with a count of -1.func (, string) []string { returngenSplit(, , 0, -1) }// SplitAfter slices s into all substrings after each instance of sep and// returns a slice of those substrings.//// If s does not contain sep and sep is not empty, SplitAfter returns// a slice of length 1 whose only element is s.//// If sep is empty, SplitAfter splits after each UTF-8 sequence. If// both s and sep are empty, SplitAfter returns an empty slice.//// It is equivalent to SplitAfterN with a count of -1.func (, string) []string {returngenSplit(, , len(), -1)}varasciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}// Fields splits the string s around each instance of one or more consecutive white space// characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an// empty slice if s contains only white space.func ( string) []string {// First count the fields. // This is an exact count if s is ASCII, otherwise it is an approximation. := 0 := 1// setBits is used to track which bits are set in the bytes of s. := uint8(0)for := 0; < len(); ++ { := [] |= := int(asciiSpace[]) += & ^ = }if >= utf8.RuneSelf {// Some runes in the input string are not ASCII.returnFieldsFunc(, unicode.IsSpace) }// ASCII fast path := make([]string, ) := 0 := 0 := 0// Skip spaces in the front of the input.for < len() && asciiSpace[[]] != 0 { ++ } = for < len() {ifasciiSpace[[]] == 0 { ++continue } [] = [:] ++ ++// Skip spaces in between fields.for < len() && asciiSpace[[]] != 0 { ++ } = }if < len() { // Last field might end at EOF. [] = [:] }return}// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)// and returns an array of slices of s. If all code points in s satisfy f(c) or the// string is empty, an empty slice is returned.//// FieldsFunc makes no guarantees about the order in which it calls f(c)// and assumes that f always returns the same value for a given c.func ( string, func(rune) bool) []string {// A span is used to record a slice of s of the form s[start:end]. // The start index is inclusive and the end index is exclusive.typestruct {intint } := make([], 0, 32)// Find the field start and end indices. // Doing this in a separate pass (rather than slicing the string s // and collecting the result substrings right away) is significantly // more efficient, possibly due to cache effects. := -1// valid span start if >= 0for , := range {if () {if >= 0 { = append(, {, })// Set start to a negative value. // Note: using -1 here consistently and reproducibly // slows down this code by a several percent on amd64. = ^ } } else {if < 0 { = } } }// Last field might end at EOF.if >= 0 { = append(, {, len()}) }// Create strings from recorded field indices. := make([]string, len())for , := range { [] = [.:.] }return}// Join concatenates the elements of its first argument to create a single string. The separator// string sep is placed between elements in the resulting string.func ( []string, string) string {switchlen() {case0:return""case1:return [0] } := len() * (len() - 1)for := 0; < len(); ++ { += len([]) }varBuilder .Grow() .WriteString([0])for , := range [1:] { .WriteString() .WriteString() }return .String()}// HasPrefix tests whether the string s begins with prefix.func (, string) bool {returnlen() >= len() && [0:len()] == }// HasSuffix tests whether the string s ends with suffix.func (, string) bool {returnlen() >= len() && [len()-len():] == }// Map returns a copy of the string s with all its characters modified// according to the mapping function. If mapping returns a negative value, the character is// dropped from the string with no replacement.func ( func(rune) rune, string) string {// In the worst case, the string can grow when mapped, making // things unpleasant. But it's so rare we barge in assuming it's // fine. It could also shrink but that falls out naturally.// The output buffer b is initialized on demand, the first // time a character differs.varBuilderfor , := range { := ()if == && != utf8.RuneError {continue }varintif == utf8.RuneError { , = utf8.DecodeRuneInString([:])if != 1 && == {continue } } else { = utf8.RuneLen() } .Grow(len() + utf8.UTFMax) .WriteString([:])if >= 0 { .WriteRune() } = [+:]break }// Fast path for unchanged inputif .Cap() == 0 { // didn't call b.Grow abovereturn }for , := range { := ()if >= 0 {// common case // Due to inlining, it is more performant to determine if WriteByte should be // invoked rather than always call WriteRuneif < utf8.RuneSelf { .WriteByte(byte()) } else {// r is not a ASCII rune. .WriteRune() } } }return .String()}// Repeat returns a new string consisting of count copies of the string s.//// It panics if count is negative or if// the result of (len(s) * count) overflows.func ( string, int) string {if == 0 {return"" }// Since we cannot return an error on overflow, // we should panic if the repeat will generate // an overflow. // See Issue golang.org/issue/16237if < 0 {panic("strings: negative Repeat count") } elseiflen()*/ != len() {panic("strings: Repeat count causes overflow") } := len() * varBuilder .Grow() .WriteString()for .Len() < {if .Len() <= /2 { .WriteString(.String()) } else { .WriteString(.String()[:-.Len()])break } }return .String()}// ToUpper returns s with all Unicode letters mapped to their upper case.func ( string) string { , := true, falsefor := 0; < len(); ++ { := []if >= utf8.RuneSelf { = falsebreak } = || ('a' <= && <= 'z') }if { // optimize for ASCII-only strings.if ! {return }varBuilder .Grow(len())for := 0; < len(); ++ { := []if'a' <= && <= 'z' { -= 'a' - 'A' } .WriteByte() }return .String() }returnMap(unicode.ToUpper, )}// ToLower returns s with all Unicode letters mapped to their lower case.func ( string) string { , := true, falsefor := 0; < len(); ++ { := []if >= utf8.RuneSelf { = falsebreak } = || ('A' <= && <= 'Z') }if { // optimize for ASCII-only strings.if ! {return }varBuilder .Grow(len())for := 0; < len(); ++ { := []if'A' <= && <= 'Z' { += 'a' - 'A' } .WriteByte() }return .String() }returnMap(unicode.ToLower, )}// ToTitle returns a copy of the string s with all Unicode letters mapped to// their Unicode title case.func ( string) string { returnMap(unicode.ToTitle, ) }// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their// upper case using the case mapping specified by c.func ( unicode.SpecialCase, string) string {returnMap(.ToUpper, )}// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their// lower case using the case mapping specified by c.func ( unicode.SpecialCase, string) string {returnMap(.ToLower, )}// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their// Unicode title case, giving priority to the special casing rules.func ( unicode.SpecialCase, string) string {returnMap(.ToTitle, )}// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences// replaced by the replacement string, which may be empty.func (, string) string {varBuilderfor , := range {if != utf8.RuneError {continue } , := utf8.DecodeRuneInString([:])if == 1 { .Grow(len() + len()) .WriteString([:]) = [:]break } }// Fast path for unchanged inputif .Cap() == 0 { // didn't call b.Grow abovereturn } := false// previous byte was from an invalid UTF-8 sequencefor := 0; < len(); { := []if < utf8.RuneSelf { ++ = false .WriteByte()continue } , := utf8.DecodeRuneInString([:])if == 1 { ++if ! { = true .WriteString() }continue } = false .WriteString([ : +]) += }return .String()}// isSeparator reports whether the rune could mark a word boundary.// TODO: update when package unicode captures more of the properties.func ( rune) bool {// ASCII alphanumerics and underscore are not separatorsif <= 0x7F {switch {case'0' <= && <= '9':returnfalsecase'a' <= && <= 'z':returnfalsecase'A' <= && <= 'Z':returnfalsecase == '_':returnfalse }returntrue }// Letters and digits are not separatorsifunicode.IsLetter() || unicode.IsDigit() {returnfalse }// Otherwise, all we can do for now is treat spaces as separators.returnunicode.IsSpace()}// Title returns a copy of the string s with all Unicode letters that begin words// mapped to their Unicode title case.//// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.func ( string) string {// Use a closure here to remember state. // Hackish but effective. Depends on Map scanning in order and calling // the closure once per rune. := ' 'returnMap(func( rune) rune {ifisSeparator() { = returnunicode.ToTitle() } = return }, )}// TrimLeftFunc returns a slice of the string s with all leading// Unicode code points c satisfying f(c) removed.func ( string, func(rune) bool) string { := indexFunc(, , false)if == -1 {return"" }return [:]}// TrimRightFunc returns a slice of the string s with all trailing// Unicode code points c satisfying f(c) removed.func ( string, func(rune) bool) string { := lastIndexFunc(, , false)if >= 0 && [] >= utf8.RuneSelf { , := utf8.DecodeRuneInString([:]) += } else { ++ }return [0:]}// TrimFunc returns a slice of the string s with all leading// and trailing Unicode code points c satisfying f(c) removed.func ( string, func(rune) bool) string {returnTrimRightFunc(TrimLeftFunc(, ), )}// IndexFunc returns the index into s of the first Unicode// code point satisfying f(c), or -1 if none do.func ( string, func(rune) bool) int {returnindexFunc(, , true)}// LastIndexFunc returns the index into s of the last// Unicode code point satisfying f(c), or -1 if none do.func ( string, func(rune) bool) int {returnlastIndexFunc(, , true)}// indexFunc is the same as IndexFunc except that if// truth==false, the sense of the predicate function is// inverted.func ( string, func(rune) bool, bool) int {for , := range {if () == {return } }return -1}// lastIndexFunc is the same as LastIndexFunc except that if// truth==false, the sense of the predicate function is// inverted.func ( string, func(rune) bool, bool) int {for := len(); > 0; { , := utf8.DecodeLastRuneInString([0:]) -= if () == {return } }return -1}// asciiSet is a 32-byte value, where each bit represents the presence of a// given ASCII character in the set. The 128-bits of the lower 16 bytes,// starting with the least-significant bit of the lowest word to the// most-significant bit of the highest word, map to the full range of all// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,// ensuring that any non-ASCII character will be reported as not in the set.typeasciiSet [8]uint32// makeASCIISet creates a set of ASCII characters and reports whether all// characters in chars are ASCII.func ( string) ( asciiSet, bool) {for := 0; < len(); ++ { := []if >= utf8.RuneSelf {return , false } [>>5] |= 1 << uint(&31) }return , true}// contains reports whether c is inside the set.func ( *asciiSet) ( byte) bool {return ([>>5] & (1 << uint(&31))) != 0}func ( string) func(rune) bool {iflen() == 1 && [0] < utf8.RuneSelf {returnfunc( rune) bool {return == rune([0]) } }if , := makeASCIISet(); {returnfunc( rune) bool {return < utf8.RuneSelf && .contains(byte()) } }returnfunc( rune) bool { returnIndexRune(, ) >= 0 }}// Trim returns a slice of the string s with all leading and// trailing Unicode code points contained in cutset removed.func (, string) string {if == "" || == "" {return }returnTrimFunc(, makeCutsetFunc())}// TrimLeft returns a slice of the string s with all leading// Unicode code points contained in cutset removed.//// To remove a prefix, use TrimPrefix instead.func (, string) string {if == "" || == "" {return }returnTrimLeftFunc(, makeCutsetFunc())}// TrimRight returns a slice of the string s, with all trailing// Unicode code points contained in cutset removed.//// To remove a suffix, use TrimSuffix instead.func (, string) string {if == "" || == "" {return }returnTrimRightFunc(, makeCutsetFunc())}// TrimSpace returns a slice of the string s, with all leading// and trailing white space removed, as defined by Unicode.func ( string) string {// Fast path for ASCII: look for the first ASCII non-space byte := 0for ; < len(); ++ { := []if >= utf8.RuneSelf {// If we run into a non-ASCII byte, fall back to the // slower unicode-aware method on the remaining bytesreturnTrimFunc([:], unicode.IsSpace) }ifasciiSpace[] == 0 {break } }// Now look for the first ASCII non-space byte from the end := len()for ; > ; -- { := [-1]if >= utf8.RuneSelf {returnTrimFunc([:], unicode.IsSpace) }ifasciiSpace[] == 0 {break } }// At this point s[start:stop] starts and ends with an ASCII // non-space bytes, so we're done. Non-ASCII cases have already // been handled above.return [:]}// TrimPrefix returns s without the provided leading prefix string.// If s doesn't start with prefix, s is returned unchanged.func (, string) string {ifHasPrefix(, ) {return [len():] }return}// TrimSuffix returns s without the provided trailing suffix string.// If s doesn't end with suffix, s is returned unchanged.func (, string) string {ifHasSuffix(, ) {return [:len()-len()] }return}// Replace returns a copy of the string s with the first n// non-overlapping instances of old replaced by new.// If old is empty, it matches at the beginning of the string// and after each UTF-8 sequence, yielding up to k+1 replacements// for a k-rune string.// If n < 0, there is no limit on the number of replacements.func (, , string, int) string {if == || == 0 {return// avoid allocation }// Compute number of replacements.if := Count(, ); == 0 {return// avoid allocation } elseif < 0 || < { = }// Apply replacements to buffer.varBuilder .Grow(len() + *(len()-len())) := 0for := 0; < ; ++ { := iflen() == 0 {if > 0 { , := utf8.DecodeRuneInString([:]) += } } else { += Index([:], ) } .WriteString([:]) .WriteString() = + len() } .WriteString([:])return .String()}// ReplaceAll returns a copy of the string s with all// non-overlapping instances of old replaced by new.// If old is empty, it matches at the beginning of the string// and after each UTF-8 sequence, yielding up to k+1 replacements// for a k-rune string.func (, , string) string {returnReplace(, , , -1)}// EqualFold reports whether s and t, interpreted as UTF-8 strings,// are equal under Unicode case-folding, which is a more general// form of case-insensitivity.func (, string) bool {for != "" && != "" {// Extract first rune from each string.var , runeif [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRuneInString() , = , [:] }if [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRuneInString() , = , [:] }// If they match, keep going; if not, return false.// Easy case.if == {continue }// Make sr < tr to simplify what follows.if < { , = , }// Fast check for ASCII.if < utf8.RuneSelf {// ASCII only, sr/tr must be upper/lower caseif'A' <= && <= 'Z' && == +'a'-'A' {continue }returnfalse }// General case. SimpleFold(x) returns the next equivalent rune > x // or wraps around to smaller values. := unicode.SimpleFold()for != && < { = unicode.SimpleFold() }if == {continue }returnfalse }// One string is empty. Are both?return == }// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.func (, string) int { := len()switch {case == 0:return0case == 1:returnIndexByte(, [0])case == len():if == {return0 }return -1case > len():return -1case <= bytealg.MaxLen:// Use brute force when s and substr both are smalliflen() <= bytealg.MaxBruteForce {returnbytealg.IndexString(, ) } := [0] := [1] := 0 := len() - + 1 := 0for < {if [] != {// IndexByte is faster than bytealg.IndexString, so use it as long as // we're not getting lots of false positives. := IndexByte([+1:], )if < 0 {return -1 } += + 1 }if [+1] == && [:+] == {return } ++ ++// Switch to bytealg.IndexString when IndexByte produces too many false positives.if > bytealg.Cutover() { := bytealg.IndexString([:], )if >= 0 {return + }return -1 } }return -1 } := [0] := [1] := 0 := len() - + 1 := 0for < {if [] != { := IndexByte([+1:], )if < 0 {return -1 } += + 1 }if [+1] == && [:+] == {return } ++ ++if >= 4+>>4 && < {// See comment in ../bytes/bytes.go. := bytealg.IndexRabinKarp([:], )if < 0 {return -1 }return + } }return -1}