{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
    UnliftedFFITypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ViewPatterns #-}

-- |
-- Module      : Data.Text.Internal.Builder
-- License     : BSD-style (see LICENSE)
-- Stability   : experimental
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Internals of "Data.Text.Encoding".
--
-- @since 2.0.2
module Data.Text.Internal.Encoding
  ( validateUtf8Chunk
  , validateUtf8More
  , decodeUtf8Chunk
  , decodeUtf8More
  , decodeUtf8With1
  , decodeUtf8With2
  , Utf8State
  , startUtf8State
  , StrictBuilder()
  , strictBuilderToText
  , textToStrictBuilder

    -- * Internal
  , skipIncomplete
  , getCompleteLen
  , getPartialUtf8
  ) where

#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits ((.&.), shiftL, shiftR)
import Data.ByteString (ByteString)
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup (Semigroup(..))
#endif
import Data.Word (Word32, Word8)
import Foreign.Storable (pokeElemOff)
import Data.Text.Encoding.Error (OnDecodeError)
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Encoding.Utf8
  (DecoderState, utf8AcceptState, utf8RejectState, updateDecoderState)
import Data.Text.Internal.StrictBuilder (StrictBuilder)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Short.Internal as SBS
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.StrictBuilder as SB
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif

#ifdef SIMDUTF
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.C.Types (CSize(..))
import Foreign.C.Types (CInt(..))
import Foreign.Ptr (Ptr)
#endif

-- | Use 'StrictBuilder' to build 'Text'.
--
-- @since 2.0.2
strictBuilderToText :: StrictBuilder -> Text
strictBuilderToText :: StrictBuilder -> Text
strictBuilderToText = StrictBuilder -> Text
SB.toText

-- | Copy 'Text' in a 'StrictBuilder'
--
-- @since 2.0.2
textToStrictBuilder :: Text -> StrictBuilder
textToStrictBuilder :: Text -> StrictBuilder
textToStrictBuilder = Text -> StrictBuilder
SB.fromText

-- | State of decoding a 'ByteString' in UTF-8.
-- Enables incremental decoding ('validateUtf8Chunk', 'validateUtf8More',
-- 'decodeUtf8Chunk', 'decodeUtf8More').
--
-- @since 2.0.2

-- Internal invariant:
-- the first component is the initial state if and only if
-- the second component is empty.
--
-- @
-- 'utf9CodePointState' s = 'utf8StartState'
-- <=>
-- 'partialUtf8CodePoint' s = 'PartialUtf8CodePoint' 0
-- @
data Utf8State = Utf8State
  { -- | State of the UTF-8 state machine.
    Utf8State -> DecoderState
utf8CodePointState :: {-# UNPACK #-} !DecoderState
    -- | Bytes of the currently incomplete code point (if any).
  , Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint :: {-# UNPACK #-} !PartialUtf8CodePoint
  }
  deriving (Utf8State -> Utf8State -> Bool
(Utf8State -> Utf8State -> Bool)
-> (Utf8State -> Utf8State -> Bool) -> Eq Utf8State
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Utf8State -> Utf8State -> Bool
== :: Utf8State -> Utf8State -> Bool
$c/= :: Utf8State -> Utf8State -> Bool
/= :: Utf8State -> Utf8State -> Bool
Eq, Int -> Utf8State -> ShowS
[Utf8State] -> ShowS
Utf8State -> String
(Int -> Utf8State -> ShowS)
-> (Utf8State -> String)
-> ([Utf8State] -> ShowS)
-> Show Utf8State
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Utf8State -> ShowS
showsPrec :: Int -> Utf8State -> ShowS
$cshow :: Utf8State -> String
show :: Utf8State -> String
$cshowList :: [Utf8State] -> ShowS
showList :: [Utf8State] -> ShowS
Show)

-- | Initial 'Utf8State'.
--
-- @since 2.0.2
startUtf8State :: Utf8State
startUtf8State :: Utf8State
startUtf8State = DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
utf8AcceptState PartialUtf8CodePoint
partUtf8Empty

-- | Prefix of a UTF-8 code point encoded in 4 bytes,
-- possibly empty.
--
-- - The most significant byte contains the number of bytes,
--   between 0 and 3.
-- - The remaining bytes hold the incomplete code point.
-- - Unused bytes must be 0.
--
-- All of operations available on it are the functions below.
-- The constructor should never be used outside of those.
--
-- @since 2.0.2
newtype PartialUtf8CodePoint = PartialUtf8CodePoint Word32
  deriving (PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
(PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool)
-> (PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool)
-> Eq PartialUtf8CodePoint
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
== :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
$c/= :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
/= :: PartialUtf8CodePoint -> PartialUtf8CodePoint -> Bool
Eq, Int -> PartialUtf8CodePoint -> ShowS
[PartialUtf8CodePoint] -> ShowS
PartialUtf8CodePoint -> String
(Int -> PartialUtf8CodePoint -> ShowS)
-> (PartialUtf8CodePoint -> String)
-> ([PartialUtf8CodePoint] -> ShowS)
-> Show PartialUtf8CodePoint
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> PartialUtf8CodePoint -> ShowS
showsPrec :: Int -> PartialUtf8CodePoint -> ShowS
$cshow :: PartialUtf8CodePoint -> String
show :: PartialUtf8CodePoint -> String
$cshowList :: [PartialUtf8CodePoint] -> ShowS
showList :: [PartialUtf8CodePoint] -> ShowS
Show)

-- | Empty prefix.
partUtf8Empty :: PartialUtf8CodePoint
partUtf8Empty :: PartialUtf8CodePoint
partUtf8Empty = Word32 -> PartialUtf8CodePoint
PartialUtf8CodePoint Word32
0

-- | Length of the partial code point, stored in the most significant byte.
partUtf8Len :: PartialUtf8CodePoint -> Int
partUtf8Len :: PartialUtf8CodePoint -> Int
partUtf8Len (PartialUtf8CodePoint Word32
w) = Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int) -> Word32 -> Int
forall a b. (a -> b) -> a -> b
$ Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` Int
24

-- | Length of the code point once completed (it is known in the first byte).
-- 0 if empty.
partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
partUtf8CompleteLen :: PartialUtf8CodePoint -> Int
partUtf8CompleteLen c :: PartialUtf8CodePoint
c@(PartialUtf8CodePoint Word32
w)
  | PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Int
0
  | Word32
0xf0 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
4
  | Word32
0xe0 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
3
  | Word32
0xc2 Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word32
firstByte = Int
2
  | Bool
otherwise = Int
0
  where
    firstByte :: Word32
firstByte = (Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` Int
16) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
255

-- | Get the @n@-th byte, assuming it is within bounds: @0 <= n < partUtf8Len c@.
--
-- Unsafe: no bounds checking.
partUtf8UnsafeIndex ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex :: PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex _c :: PartialUtf8CodePoint
_c@(PartialUtf8CodePoint Word32
w) Int
n =
#if defined(ASSERTS)
  assert (0 <= n && n < partUtf8Len _c) $
#endif
  Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Word8) -> Word32 -> Word8
forall a b. (a -> b) -> a -> b
$ Word32
w Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftR` (Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
n)

-- | Append some bytes.
--
-- Unsafe: no bounds checking.
partUtf8UnsafeAppend ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend :: PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend c :: PartialUtf8CodePoint
c@(PartialUtf8CodePoint Word32
word) ByteString
bs =
#if defined(ASSERTS)
  assert (lenc + lenbs <= 3) $
#endif
  Word32 -> PartialUtf8CodePoint
PartialUtf8CodePoint (Word32 -> PartialUtf8CodePoint) -> Word32 -> PartialUtf8CodePoint
forall a b. (a -> b) -> a -> b
$
    Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
0 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
1 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Int -> Word32 -> Word32
forall {a}. (Num a, Bits a) => Int -> a -> a
tryPush Int
2 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$ Word32
word Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
lenbs Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
`shiftL` Int
24)
  where
    lenc :: Int
lenc = PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c
    lenbs :: Int
lenbs = ByteString -> Int
B.length ByteString
bs
    tryPush :: Int -> a -> a
tryPush Int
i a
w =
      if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenbs
      then a
w a -> a -> a
forall a. Num a => a -> a -> a
+ (Word8 -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftL` Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
8 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Int
lenc Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)))
      else a
w

-- | Fold a 'PartialUtf8CodePoint'. This avoids recursion so it can unfold to straightline code.
{-# INLINE partUtf8Foldr #-}
partUtf8Foldr :: (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr :: forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr Word8 -> a -> a
f a
x0 PartialUtf8CodePoint
c = case PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c of
    Int
0 -> a
x0
    Int
1 -> Int -> a -> a
build Int
0 a
x0
    Int
2 -> Int -> a -> a
build Int
0 (Int -> a -> a
build Int
1 a
x0)
    Int
_ -> Int -> a -> a
build Int
0 (Int -> a -> a
build Int
1 (Int -> a -> a
build Int
2 a
x0))
  where
    build :: Int -> a -> a
build Int
i a
x = Word8 -> a -> a
f (PartialUtf8CodePoint -> Int -> Word8
partUtf8UnsafeIndex PartialUtf8CodePoint
c Int
i) a
x

-- | Convert 'PartialUtf8CodePoint' to 'ByteString'.
partUtf8ToByteString :: PartialUtf8CodePoint -> B.ByteString
partUtf8ToByteString :: PartialUtf8CodePoint -> ByteString
partUtf8ToByteString PartialUtf8CodePoint
c = Int -> (Ptr Word8 -> IO ()) -> ByteString
BI.unsafeCreate (PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
c) ((Ptr Word8 -> IO ()) -> ByteString)
-> (Ptr Word8 -> IO ()) -> ByteString
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr ->
  (Word8 -> (Int -> IO ()) -> Int -> IO ())
-> (Int -> IO ()) -> PartialUtf8CodePoint -> Int -> IO ()
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr (\Word8
w Int -> IO ()
k Int
i -> Ptr Word8 -> Int -> Word8 -> IO ()
forall a. Storable a => Ptr a -> Int -> a -> IO ()
pokeElemOff Ptr Word8
ptr Int
i Word8
w IO () -> IO () -> IO ()
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Int -> IO ()
k (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)) (\Int
_ -> () -> IO ()
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()) PartialUtf8CodePoint
c Int
0

-- | Exported for testing.
getCompleteLen :: Utf8State -> Int
getCompleteLen :: Utf8State -> Int
getCompleteLen = PartialUtf8CodePoint -> Int
partUtf8CompleteLen (PartialUtf8CodePoint -> Int)
-> (Utf8State -> PartialUtf8CodePoint) -> Utf8State -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint

-- | Exported for testing.
getPartialUtf8 :: Utf8State -> B.ByteString
getPartialUtf8 :: Utf8State -> ByteString
getPartialUtf8 = PartialUtf8CodePoint -> ByteString
partUtf8ToByteString (PartialUtf8CodePoint -> ByteString)
-> (Utf8State -> PartialUtf8CodePoint) -> Utf8State -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint

#ifdef SIMDUTF
foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
    :: Ptr Word8 -> CSize -> IO CInt
#endif

-- | Validate a 'ByteString' as UTF-8-encoded text. To be continued using 'validateUtf8More'.
--
-- See also 'validateUtf8More' for details on the result of this function.
--
-- @
-- 'validateUtf8Chunk' = 'validateUtf8More' 'startUtf8State'
-- @
--
-- @since 2.0.2
--
-- === Properties
--
-- Given:
--
-- @
-- 'validateUtf8Chunk' chunk = (n, ms)
-- @
--
-- - The prefix is valid UTF-8. In particular, it should be accepted
--   by this validation:
--
--     @
--     'validateUtf8Chunk' ('Data.ByteString.take' n chunk) = (n, Just 'startUtf8State')
--     @
validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
validateUtf8Chunk :: ByteString -> (Int, Maybe Utf8State)
validateUtf8Chunk ByteString
bs = Int
-> ByteString
-> (Int -> Maybe Utf8State -> (Int, Maybe Utf8State))
-> (Int, Maybe Utf8State)
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
0 ByteString
bs (,)

-- Assume bytes up to offset @ofs@ have been validated already.
--
-- Using CPS lets us inline the continuation and avoid allocating a @Maybe@
-- in the @decode...@ functions.
{-# INLINE validateUtf8ChunkFrom #-}
validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom :: forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
ofs ByteString
bs Int -> Maybe Utf8State -> r
k
  -- B.isValidUtf8 is buggy before bytestring-0.11.5.3 / bytestring-0.12.1.0.
  -- MIN_VERSION_bytestring does not allow us to differentiate
  -- between 0.11.5.2 and 0.11.5.3 so no choice except demanding 0.12.1+.
#if defined(SIMDUTF) || MIN_VERSION_bytestring(0,12,1)
  | Int
guessUtf8Boundary Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&&
    -- the rest of the bytestring is valid utf-8 up to the boundary
    (
#ifdef SIMDUTF
      withBS (B.drop ofs bs) $ \ fp _ -> unsafeDupablePerformIO $
        unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$>
          c_is_valid_utf8 ptr (fromIntegral guessUtf8Boundary)
#else
      ByteString -> Bool
B.isValidUtf8 (ByteString -> Bool) -> ByteString -> Bool
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
B.take Int
guessUtf8Boundary (Int -> ByteString -> ByteString
B.drop Int
ofs ByteString
bs)
#endif
    ) = Int -> r
slowValidateUtf8ChunkFrom (Int
ofs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
guessUtf8Boundary)
    -- No
  | Bool
otherwise = Int -> r
slowValidateUtf8ChunkFrom Int
ofs
  where
    len :: Int
len = ByteString -> Int
B.length ByteString
bs Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ofs
    isBoundary :: Int -> (Word8 -> Bool) -> Bool
isBoundary Int
n Word8 -> Bool
p = Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n Bool -> Bool -> Bool
&& Word8 -> Bool
p (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs (Int
ofs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n))
    guessUtf8Boundary :: Int
guessUtf8Boundary
      | Int -> (Word8 -> Bool) -> Bool
isBoundary Int
1 (Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x80) = Int
len      -- last char is ASCII (common short-circuit)
      | Int -> (Word8 -> Bool) -> Bool
isBoundary Int
1 (Word8
0xc2 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1  -- last char starts a two-(or more-)byte code point
      | Int -> (Word8 -> Bool) -> Bool
isBoundary Int
2 (Word8
0xe0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2  -- pre-last char starts a three-or-four-byte code point
      | Int -> (Word8 -> Bool) -> Bool
isBoundary Int
3 (Word8
0xf0 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<=) = Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
3  -- third to last char starts a four-byte code point
      | Bool
otherwise = Int
len
#else
  = slowValidateUtf8ChunkFrom ofs
  where
#endif
    -- A pure Haskell implementation of validateUtf8More.
    -- Ideally the primitives 'B.isValidUtf8' or 'c_is_valid_utf8' should give us
    -- indices to let us avoid this function.
    slowValidateUtf8ChunkFrom :: Int -> r
    slowValidateUtf8ChunkFrom :: Int -> r
slowValidateUtf8ChunkFrom Int
ofs1 = Int -> Int -> DecoderState -> r
slowLoop Int
ofs1 Int
ofs1 DecoderState
utf8AcceptState

    slowLoop :: Int -> Int -> DecoderState -> r
slowLoop !Int
utf8End Int
i DecoderState
s
      | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< ByteString -> Int
B.length ByteString
bs =
          case Word8 -> DecoderState -> DecoderState
updateDecoderState (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) DecoderState
s of
            DecoderState
s' | DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8RejectState -> Int -> Maybe Utf8State -> r
k Int
utf8End Maybe Utf8State
forall a. Maybe a
Nothing
               | DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8AcceptState -> Int -> Int -> DecoderState -> r
slowLoop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
               | Bool
otherwise -> Int -> Int -> DecoderState -> r
slowLoop Int
utf8End (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
      | Bool
otherwise = Int -> Maybe Utf8State -> r
k Int
utf8End (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just (DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
s (PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend PartialUtf8CodePoint
partUtf8Empty (Int -> ByteString -> ByteString
B.drop Int
utf8End ByteString
bs))))

-- | Validate another 'ByteString' chunk in an ongoing stream of UTF-8-encoded text.
--
-- Returns a pair:
--
-- 1. The first component @n@ is the end position, relative to the current
--    chunk, of the longest prefix of the accumulated bytestring which is valid UTF-8.
--    @n@ may be negative: that happens when an incomplete code point started in
--    a previous chunk and is not completed by the current chunk (either
--    that code point is still incomplete, or it is broken by an invalid byte).
--
-- 2. The second component @ms@ indicates the following:
--
--     - if @ms = Nothing@, the remainder of the chunk contains an invalid byte,
--       within four bytes from position @n@;
--     - if @ms = Just s'@, you can carry on validating another chunk
--       by calling 'validateUtf8More' with the new state @s'@.
--
-- @since 2.0.2
--
-- === Properties
--
-- Given:
--
-- @
-- 'validateUtf8More' s chunk = (n, ms)
-- @
--
-- - If the chunk is invalid, it cannot be extended to be valid.
--
--     @
--     ms = Nothing
--     ==> 'validateUtf8More' s (chunk '<>' more) = (n, Nothing)
--     @
--
-- - Validating two chunks sequentially is the same as validating them
--   together at once:
--
--     @
--     ms = Just s'
--     ==> 'validateUtf8More' s (chunk '<>' more) = 'Data.Bifunctor.first' ('Data.ByteString.length' chunk '+') ('validateUtf8More' s' more)
--     @
validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
validateUtf8More :: Utf8State -> ByteString -> (Int, Maybe Utf8State)
validateUtf8More Utf8State
st ByteString
bs = Utf8State
-> ByteString
-> (Int -> Maybe Utf8State -> (Int, Maybe Utf8State))
-> (Int, Maybe Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
st ByteString
bs (,)

-- CPS: inlining the continuation lets us make more tail calls and avoid
-- allocating a @Maybe@ in @decodeWith1/2@.
{-# INLINE validateUtf8MoreCont #-}
validateUtf8MoreCont :: Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont :: forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont st :: Utf8State
st@(Utf8State DecoderState
s0 PartialUtf8CodePoint
part) ByteString
bs Int -> Maybe Utf8State -> r
k
  | Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 = Int -> DecoderState -> r
loop Int
0 DecoderState
s0
  | Bool
otherwise = Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just Utf8State
st)
  where
    len :: Int
len = ByteString -> Int
B.length ByteString
bs
    -- Complete an incomplete code point (if there is one)
    -- and then jump to validateUtf8ChunkFrom
    loop :: Int -> DecoderState -> r
loop !Int
i DecoderState
s
      | DecoderState
s DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8AcceptState = Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
i ByteString
bs Int -> Maybe Utf8State -> r
k
      | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len =
        case Word8 -> DecoderState -> DecoderState
updateDecoderState (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
i) DecoderState
s of
          DecoderState
s' | DecoderState
s' DecoderState -> DecoderState -> Bool
forall a. Eq a => a -> a -> Bool
== DecoderState
utf8RejectState -> Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) Maybe Utf8State
forall a. Maybe a
Nothing
             | Bool
otherwise -> Int -> DecoderState -> r
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) DecoderState
s'
      | Bool
otherwise = Int -> Maybe Utf8State -> r
k (- PartialUtf8CodePoint -> Int
partUtf8Len PartialUtf8CodePoint
part) (Utf8State -> Maybe Utf8State
forall a. a -> Maybe a
Just (DecoderState -> PartialUtf8CodePoint -> Utf8State
Utf8State DecoderState
s (PartialUtf8CodePoint -> ByteString -> PartialUtf8CodePoint
partUtf8UnsafeAppend PartialUtf8CodePoint
part ByteString
bs)))

-- Eta-expanded to inline partUtf8Foldr
partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder :: PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder PartialUtf8CodePoint
c =
  (Word8 -> StrictBuilder -> StrictBuilder)
-> StrictBuilder -> PartialUtf8CodePoint -> StrictBuilder
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr (StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
(<>) (StrictBuilder -> StrictBuilder -> StrictBuilder)
-> (Word8 -> StrictBuilder)
-> Word8
-> StrictBuilder
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> StrictBuilder
SB.unsafeFromWord8) StrictBuilder
forall a. Monoid a => a
mempty PartialUtf8CodePoint
c

utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
utf8StateToStrictBuilder :: Utf8State -> StrictBuilder
utf8StateToStrictBuilder = PartialUtf8CodePoint -> StrictBuilder
partUtf8ToStrictBuilder (PartialUtf8CodePoint -> StrictBuilder)
-> (Utf8State -> PartialUtf8CodePoint)
-> Utf8State
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint

-- | Decode another chunk in an ongoing UTF-8 stream.
--
-- Returns a triple:
--
-- 1. A 'StrictBuilder' for the decoded chunk of text. You can accumulate
--    chunks with @('<>')@ or output them with 'SB.toText'.
-- 2. The undecoded remainder of the given chunk, for diagnosing errors
--    and resuming (presumably after skipping some bytes).
-- 3. 'Just' the new state, or 'Nothing' if an invalid byte was encountered
--    (it will be within the first 4 bytes of the undecoded remainder).
--
-- @since 2.0.2
--
-- === Properties
--
-- Given:
--
-- @
-- (pre, suf, ms) = 'decodeUtf8More' s chunk
-- @
--
-- 1. If the output @pre@ is nonempty (alternatively, if @length chunk > length suf@)
--
--     @
--     s2b pre \`'Data.ByteString.append'\` suf = p2b s \`'Data.ByteString.append'\` chunk
--     @
--
--     where
--
--     @
--     s2b = 'Data.Text.Encoding.encodeUtf8' . 'Data.Text.Encoding.toText'
--     p2b = 'Data.Text.Internal.Encoding.partUtf8ToByteString'
--     @
--
-- 2. If the output @pre@ is empty (alternatively, if @length chunk = length suf@)
--
--     @suf = chunk@
--
-- 3. Decoding chunks separately is equivalent to decoding their concatenation.
--
--     Given:
--
--     @
--     (pre1, suf1, Just s1) = 'decodeUtf8More' s chunk1
--     (pre2, suf2,     ms2) = 'decodeUtf8More' s1 chunk2
--     (pre3, suf3,     ms3) = 'decodeUtf8More' s (chunk1 \`B.append\` chunk2)
--     @
--
--     we have:
--
--     @
--     s2b (pre1 '<>' pre2) = s2b pre3
--     ms2 = ms3
--     @
decodeUtf8More :: Utf8State -> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More :: Utf8State
-> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More Utf8State
s ByteString
bs =
  Utf8State
-> ByteString
-> (Int
    -> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (StrictBuilder, ByteString, Maybe Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
s ByteString
bs ((Int
  -> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
 -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (Int
    -> Maybe Utf8State -> (StrictBuilder, ByteString, Maybe Utf8State))
-> (StrictBuilder, ByteString, Maybe Utf8State)
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms ->
    let builder :: StrictBuilder
builder | Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0 = StrictBuilder
forall a. Monoid a => a
mempty
                | Bool
otherwise = Utf8State -> StrictBuilder
utf8StateToStrictBuilder Utf8State
s
                  StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs)
    in (StrictBuilder
builder, Int -> ByteString -> ByteString
B.drop Int
len ByteString
bs, Maybe Utf8State
ms)

-- | Decode a chunk of UTF-8 text. To be continued with 'decodeUtf8More'.
--
-- See 'decodeUtf8More' for details on the result.
--
-- @since 2.0.2
--
-- === Properties
--
-- @
-- 'decodeUtf8Chunk' = 'decodeUtf8More' 'startUtf8State'
-- @
--
-- Given:
--
-- @
-- 'decodeUtf8Chunk' chunk = (builder, rest, ms)
-- @
--
-- @builder@ is a prefix and @rest@ is a suffix of @chunk@.
--
-- @
-- 'Data.Text.Encoding.encodeUtf8' ('Data.Text.Encoding.strictBuilderToText' builder) '<>' rest = chunk
-- @
decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8Chunk :: ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8Chunk = Utf8State
-> ByteString -> (StrictBuilder, ByteString, Maybe Utf8State)
decodeUtf8More Utf8State
startUtf8State

-- | Call the error handler on each byte of the partial code point stored in
-- 'Utf8State' and append the results.
--
-- Exported for use in lazy 'Data.Text.Lazy.Encoding.decodeUtf8With'.
--
-- @since 2.0.2
{-# INLINE skipIncomplete #-}
skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete :: OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s =
  (Word8 -> StrictBuilder -> StrictBuilder)
-> StrictBuilder -> PartialUtf8CodePoint -> StrictBuilder
forall a. (Word8 -> a -> a) -> a -> PartialUtf8CodePoint -> a
partUtf8Foldr
    (StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
(<>) (StrictBuilder -> StrictBuilder -> StrictBuilder)
-> (Word8 -> StrictBuilder)
-> Word8
-> StrictBuilder
-> StrictBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg)
    StrictBuilder
forall a. Monoid a => a
mempty (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s)

{-# INLINE handleUtf8Error #-}
handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error :: OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg Word8
w = case OnDecodeError
onErr String
msg (Word8 -> Maybe Word8
forall a. a -> Maybe a
Just Word8
w) of
  Just Char
c -> Char -> StrictBuilder
SB.fromChar Char
c
  Maybe Char
Nothing -> StrictBuilder
forall a. Monoid a => a
mempty

-- | Helper for 'Data.Text.Encoding.decodeUtf8With'.
--
-- @since 2.0.2

-- This could be shorter by calling 'decodeUtf8With2' directly, but we make the
-- first call validateUtf8Chunk directly to return even faster in successful
-- cases.
decodeUtf8With1 ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  OnDecodeError -> String -> ByteString -> Text
decodeUtf8With1 :: OnDecodeError -> String -> ByteString -> Text
decodeUtf8With1 OnDecodeError
onErr String
msg ByteString
bs = Int -> ByteString -> (Int -> Maybe Utf8State -> Text) -> Text
forall r. Int -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8ChunkFrom Int
0 ByteString
bs ((Int -> Maybe Utf8State -> Text) -> Text)
-> (Int -> Maybe Utf8State -> Text) -> Text
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms -> case Maybe Utf8State
ms of
    Just Utf8State
s
      | Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length ByteString
bs ->
        let !(SBS.SBS ByteArray#
arr) = ByteString -> ShortByteString
SBS.toShort ByteString
bs in
        Array -> Int -> Int -> Text
Text (ByteArray# -> Array
A.ByteArray ByteArray#
arr) Int
0 Int
len
      | Bool
otherwise -> StrictBuilder -> Text
SB.toText (StrictBuilder -> Text) -> StrictBuilder -> Text
forall a b. (a -> b) -> a -> b
$
          ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s
    Maybe Utf8State
Nothing ->
       let (StrictBuilder
builder, ByteString
_, Utf8State
s) = OnDecodeError
-> String
-> Utf8State
-> ByteString
-> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 OnDecodeError
onErr String
msg Utf8State
startUtf8State (Int -> ByteString -> ByteString
B.drop (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) ByteString
bs) in
       StrictBuilder -> Text
SB.toText (StrictBuilder -> Text) -> StrictBuilder -> Text
forall a b. (a -> b) -> a -> b
$
         ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len ByteString
bs) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
         OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs Int
len) StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
         StrictBuilder
builder StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<>
         OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s

-- | Helper for 'Data.Text.Encoding.decodeUtf8With',
-- 'Data.Text.Encoding.streamDecodeUtf8With', and lazy
-- 'Data.Text.Lazy.Encoding.decodeUtf8With',
-- which use an 'OnDecodeError' to process bad bytes.
--
-- See 'decodeUtf8Chunk' for a more flexible alternative.
--
-- @since 2.0.2
decodeUtf8With2 ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  OnDecodeError -> String -> Utf8State -> ByteString -> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 :: OnDecodeError
-> String
-> Utf8State
-> ByteString
-> (StrictBuilder, ByteString, Utf8State)
decodeUtf8With2 OnDecodeError
onErr String
msg Utf8State
s0 ByteString
bs = Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
s0 Int
0 StrictBuilder
forall a. Monoid a => a
mempty
  where
    loop :: Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
s Int
i !StrictBuilder
builder =
      let nonEmptyPrefix :: Int -> StrictBuilder
nonEmptyPrefix Int
len = StrictBuilder
builder
            StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> Utf8State -> StrictBuilder
utf8StateToStrictBuilder Utf8State
s
            StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> ByteString -> StrictBuilder
SB.unsafeFromByteString (Int -> ByteString -> ByteString
B.take Int
len (Int -> ByteString -> ByteString
B.drop Int
i ByteString
bs))
      in Utf8State
-> ByteString
-> (Int
    -> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
-> (StrictBuilder, ByteString, Utf8State)
forall r.
Utf8State -> ByteString -> (Int -> Maybe Utf8State -> r) -> r
validateUtf8MoreCont Utf8State
s (Int -> ByteString -> ByteString
B.drop Int
i ByteString
bs) ((Int -> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
 -> (StrictBuilder, ByteString, Utf8State))
-> (Int
    -> Maybe Utf8State -> (StrictBuilder, ByteString, Utf8State))
-> (StrictBuilder, ByteString, Utf8State)
forall a b. (a -> b) -> a -> b
$ \Int
len Maybe Utf8State
ms -> case Maybe Utf8State
ms of
        Maybe Utf8State
Nothing ->
          if Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0
          then
            -- If the first byte cannot complete the partial code point in s,
            -- retry from startUtf8State.
            let builder' :: StrictBuilder
builder' = StrictBuilder
builder StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Utf8State -> StrictBuilder
skipIncomplete OnDecodeError
onErr String
msg Utf8State
s
            -- Note: loop is strict on builder, so if onErr raises an error it will
            -- be forced here, short-circuiting the loop as desired.
            in Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
startUtf8State Int
i StrictBuilder
builder'
          else
            let builder' :: StrictBuilder
builder' = Int -> StrictBuilder
nonEmptyPrefix Int
len
                  StrictBuilder -> StrictBuilder -> StrictBuilder
forall a. Semigroup a => a -> a -> a
<> OnDecodeError -> String -> Word8 -> StrictBuilder
handleUtf8Error OnDecodeError
onErr String
msg (HasCallStack => ByteString -> Int -> Word8
ByteString -> Int -> Word8
B.index ByteString
bs (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len))
            in Utf8State
-> Int -> StrictBuilder -> (StrictBuilder, ByteString, Utf8State)
loop Utf8State
startUtf8State (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) StrictBuilder
builder'
        Just Utf8State
s' ->
          let builder' :: StrictBuilder
builder' = if Int
len Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0 then StrictBuilder
builder else Int -> StrictBuilder
nonEmptyPrefix Int
len
              undecoded :: ByteString
undecoded = if ByteString -> Int
B.length ByteString
bs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= PartialUtf8CodePoint -> Int
partUtf8Len (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s')
                then Int -> ByteString -> ByteString
B.drop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len) ByteString
bs  -- Reuse bs if possible
                else PartialUtf8CodePoint -> ByteString
partUtf8ToByteString (Utf8State -> PartialUtf8CodePoint
partialUtf8CodePoint Utf8State
s')
          in (StrictBuilder
builder', ByteString
undecoded, Utf8State
s')