-- |
-- Copyright:   (c) 2021-2022 Andrew Lelechenko
-- Licence:     BSD3
-- Maintainer:  Andrew Lelechenko <andrew.lelechenko@gmail.com>

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE UnliftedFFITypes #-}

module Data.Text.Utf16.Lines
  ( I.TextLines
  , I.fromText
  , I.toText
  , I.null
  -- * Lines
  , I.lines
  , I.lengthInLines
  , I.splitAtLine
  -- * UTF-16 code units
  , length
  , splitAt
  , Position(..)
  , lengthAsPosition
  , splitAtPosition
  ) where

import Prelude ((+), (-), seq)
import Control.DeepSeq (NFData, rnf)
import Data.Bool (otherwise)
import Data.Eq (Eq, (==))
import Data.Function ((.), ($))
import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Ord (Ord, (<=), (>), (>=))
import Data.Semigroup (Semigroup(..))
import qualified Data.Text.Array as TA
import Data.Text.Internal (Text(..))
import qualified Data.Text.Lines.Internal as I
import qualified Data.Vector.Unboxed as U
import Data.Word (Word)
import Text.Show (Show)

#if MIN_VERSION_text(2,0,0)
import Prelude (fromIntegral)
import Foreign.C.Types (CSize(..))
import GHC.Exts (ByteArray#)
import System.IO (IO)
import System.IO.Unsafe (unsafeDupablePerformIO)
import System.Posix.Types (CSsize(..))
#else
import Data.Bool ((&&))
import Data.Ord ((<))
#endif

#ifdef DEBUG
import GHC.Stack (HasCallStack)
#else
#define HasCallStack ()
#endif

lengthTextUtf16 :: Text -> Word
#if MIN_VERSION_text(2,0,0)
lengthTextUtf16 :: Text -> Word
lengthTextUtf16 (Text (TA.ByteArray ByteArray#
arr) Int
off Int
len) = CSsize -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CSsize -> Word) -> CSsize -> Word
forall a b. (a -> b) -> a -> b
$ IO CSsize -> CSsize
forall a. IO a -> a
unsafeDupablePerformIO (IO CSsize -> CSsize) -> IO CSsize -> CSsize
forall a b. (a -> b) -> a -> b
$
  ByteArray# -> CSize -> CSize -> IO CSsize
lengthUtf8AsUtf16 ByteArray#
arr (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
off) (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)

foreign import ccall unsafe "_hs_text_lines_length_utf8_as_utf16" lengthUtf8AsUtf16
  :: ByteArray# -> CSize -> CSize -> IO CSsize
#else
lengthTextUtf16 (Text _ _ len) = I.intToWord len
#endif

-- | Length in UTF-16 code units.
-- Takes linear time.
--
-- >>> :set -XOverloadedStrings
-- >>> length "fя𐀀"
-- 4
-- >>> Data.Text.Lines.length "fя𐀀"
-- 3
--
length :: I.TextLines -> Word
length :: TextLines -> Word
length = Text -> Word
lengthTextUtf16 (Text -> Word) -> (TextLines -> Text) -> TextLines -> Word
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TextLines -> Text
I.toText

-- | Represent a position in a text.
data Position = Position
  { Position -> Word
posLine   :: !Word -- ^ Line.
  , Position -> Word
posColumn :: !Word -- ^ Column in UTF-16 code units.
  } deriving (Position -> Position -> Bool
(Position -> Position -> Bool)
-> (Position -> Position -> Bool) -> Eq Position
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Position -> Position -> Bool
== :: Position -> Position -> Bool
$c/= :: Position -> Position -> Bool
/= :: Position -> Position -> Bool
Eq, Eq Position
Eq Position =>
(Position -> Position -> Ordering)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Bool)
-> (Position -> Position -> Position)
-> (Position -> Position -> Position)
-> Ord Position
Position -> Position -> Bool
Position -> Position -> Ordering
Position -> Position -> Position
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Position -> Position -> Ordering
compare :: Position -> Position -> Ordering
$c< :: Position -> Position -> Bool
< :: Position -> Position -> Bool
$c<= :: Position -> Position -> Bool
<= :: Position -> Position -> Bool
$c> :: Position -> Position -> Bool
> :: Position -> Position -> Bool
$c>= :: Position -> Position -> Bool
>= :: Position -> Position -> Bool
$cmax :: Position -> Position -> Position
max :: Position -> Position -> Position
$cmin :: Position -> Position -> Position
min :: Position -> Position -> Position
Ord, Int -> Position -> ShowS
[Position] -> ShowS
Position -> String
(Int -> Position -> ShowS)
-> (Position -> String) -> ([Position] -> ShowS) -> Show Position
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Position -> ShowS
showsPrec :: Int -> Position -> ShowS
$cshow :: Position -> String
show :: Position -> String
$cshowList :: [Position] -> ShowS
showList :: [Position] -> ShowS
Show)

instance NFData Position where
  rnf :: Position -> ()
rnf = (Position -> () -> ()
forall a b. a -> b -> b
`seq` ())

-- | Associativity does not hold when 'posLine' overflows.
instance Semigroup Position where
  Position Word
l1 Word
c1 <> :: Position -> Position -> Position
<> Position Word
l2 Word
c2 =
    Word -> Word -> Position
Position (Word
l1 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
l2) (if Word
l2 Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
0 then Word
c1 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
c2 else Word
c2)

instance Monoid Position where
  mempty :: Position
mempty = Word -> Word -> Position
Position Word
0 Word
0
  mappend :: Position -> Position -> Position
mappend = Position -> Position -> Position
forall a. Semigroup a => a -> a -> a
(<>)

-- | Measure text length as an amount of lines and columns.
-- Time is proportional to the length of the last line.
--
-- >>> :set -XOverloadedStrings
-- >>> lengthAsPosition "f𐀀"
-- Position {posLine = 0, posColumn = 3}
-- >>> lengthAsPosition "f\n𐀀"
-- Position {posLine = 1, posColumn = 2}
-- >>> lengthAsPosition "f\n𐀀\n"
-- Position {posLine = 2, posColumn = 0}
--
lengthAsPosition
  :: I.TextLines
  -> Position
lengthAsPosition :: TextLines -> Position
lengthAsPosition (I.TextLines (Text ByteArray
arr Int
off Int
len) Vector Int
nls) = Position
  { posLine :: Word
posLine = Int -> Word
I.intToWord (Int -> Word) -> Int -> Word
forall a b. (a -> b) -> a -> b
$ Vector Int -> Int
forall a. Unbox a => Vector a -> Int
U.length Vector Int
nls
  , posColumn :: Word
posColumn = Text -> Word
lengthTextUtf16 (Text -> Word) -> Text -> Word
forall a b. (a -> b) -> a -> b
$ ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
nl (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nl)
  }
  where
    nl :: Int
nl = if Vector Int -> Bool
forall a. Unbox a => Vector a -> Bool
U.null Vector Int
nls then Int
off else Vector Int -> Int
forall a. Unbox a => Vector a -> a
U.last Vector Int
nls Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1

splitTextAtUtf16Index :: Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index :: Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index Word
k t :: Text
t@(Text ByteArray
arr Int
off Int
len)
  | Word
k Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0 = (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
off Int
0, Text
t)
  | Word
k Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> Word
I.intToWord Int
len = (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (Text
t, Text
forall a. Monoid a => a
mempty)
#if MIN_VERSION_text(2,0,0)
  | Int
o Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 = (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
off Int
o, ByteArray -> Int -> Int -> Text
Text ByteArray
arr (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
o) (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
o))
  | Bool
otherwise = Maybe (Text, Text)
forall a. Maybe a
Nothing
    where
      !(TA.ByteArray ByteArray#
arr#) = ByteArray
arr
      o :: Int
o = CSsize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CSsize -> Int) -> CSsize -> Int
forall a b. (a -> b) -> a -> b
$ IO CSsize -> CSsize
forall a. IO a -> a
unsafeDupablePerformIO (IO CSsize -> CSsize) -> IO CSsize -> CSsize
forall a b. (a -> b) -> a -> b
$
        ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
takeUtf8AsUtf16 ByteArray#
arr# (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
off) (Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len) (Word -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
k)

foreign import ccall unsafe "_hs_text_lines_take_utf8_as_utf16" takeUtf8AsUtf16
  :: ByteArray# -> CSize -> CSize -> CSize -> IO CSsize
#else
  -- Something wrong is going here:
  | otherwise = if c >= 0xdc00 && c < 0xe000 then Nothing else Just
    (Text arr off k', Text arr (off + k') (len - k'))
    where
      k' = I.wordToInt k
      c = TA.unsafeIndex arr (off + k')
#endif

-- | Combination of 'I.splitAtLine' and subsequent 'splitAt'.
-- If requested number of code units splits a code point in half, return 'Nothing'.
-- Time is linear in 'posColumn', but does not depend on 'posLine'.
--
-- >>> :set -XOverloadedStrings
-- >>> splitAtPosition (Position 1 0) "f\n𐀀я"
-- Just ("f\n","𐀀я")
-- >>> splitAtPosition (Position 1 1) "f\n𐀀я"
-- Nothing
-- >>> splitAtPosition (Position 1 2) "f\n𐀀я"
-- Just ("f\n𐀀","я")
-- >>> splitAtPosition (Position 0 2) "f\n𐀀я"
-- Just ("f\n","𐀀я")
-- >>> splitAtPosition (Position 0 3) "f\n𐀀я"
-- Nothing
-- >>> splitAtPosition (Position 0 4) "f\n𐀀я"
-- Just ("f\n𐀀","я")
--
splitAtPosition
  :: HasCallStack
  => Position
  -> I.TextLines
  -> Maybe (I.TextLines, I.TextLines)
splitAtPosition :: Position -> TextLines -> Maybe (TextLines, TextLines)
splitAtPosition (Position Word
line Word
column) (I.TextLines (Text ByteArray
arr Int
off Int
len) Vector Int
nls) =
  case Word -> Text -> Maybe (Text, Text)
splitTextAtUtf16Index Word
column Text
tx of
    Maybe (Text, Text)
Nothing -> Maybe (TextLines, TextLines)
forall a. Maybe a
Nothing
    Just (Text ByteArray
_ Int
off' Int
len', Text
tz) -> let n :: Int
n = Vector Int -> Int -> Int
forall a. (Ord a, Unbox a) => Vector a -> a -> Int
I.binarySearch Vector Int
nls (Int
off' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len') in (TextLines, TextLines) -> Maybe (TextLines, TextLines)
forall a. a -> Maybe a
Just
      ( Text -> Vector Int -> TextLines
I.textLines (ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
off (Int
off' Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
off)) (Int -> Vector Int -> Vector Int
forall a. Unbox a => Int -> Vector a -> Vector a
U.take Int
n Vector Int
nls)
      , Text -> Vector Int -> TextLines
I.textLines Text
tz (Int -> Vector Int -> Vector Int
forall a. Unbox a => Int -> Vector a -> Vector a
U.drop Int
n Vector Int
nls))
  where
    arrLen :: Int
arrLen = Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len
    nl :: Int
nl
      | Word
line Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0 = Int
off
      | Word
line Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
> Int -> Word
I.intToWord (Vector Int -> Int
forall a. Unbox a => Vector a -> Int
U.length Vector Int
nls) = Int
arrLen
      | Bool
otherwise = Vector Int
nls Vector Int -> Int -> Int
forall a. Unbox a => Vector a -> Int -> a
U.! (Word -> Int
I.wordToInt Word
line Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
    tx :: Text
tx = ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
nl (Int
arrLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nl)

-- | Split at given UTF-16 code unit.
-- If requested number of code units splits a code point in half, return 'Nothing'.
-- Takes linear time.
--
-- >>> :set -XOverloadedStrings
-- >>> map (\c -> splitAt c "fя𐀀") [0..4]
-- [Just ("","fя𐀀"),Just ("f","я𐀀"),Just ("fя","𐀀"),Nothing,Just ("fя𐀀","")]
--
splitAt :: HasCallStack => Word -> I.TextLines -> Maybe (I.TextLines, I.TextLines)
splitAt :: Word -> TextLines -> Maybe (TextLines, TextLines)
splitAt = Position -> TextLines -> Maybe (TextLines, TextLines)
splitAtPosition (Position -> TextLines -> Maybe (TextLines, TextLines))
-> (Word -> Position)
-> Word
-> TextLines
-> Maybe (TextLines, TextLines)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word -> Word -> Position
Position Word
0