{-# LANGUAGE CPP #-}
module Codec.Text.IConv (
convert,
EncodingName,
convertFuzzy,
Fuzzy(..),
convertStrictly,
convertLazily,
ConversionError(..),
reportConversionError,
Span(..),
) where
import Prelude hiding (length, span)
import Control.Exception (assert)
import qualified Control.Exception as Exception
import Foreign.C.Error as C.Error (Errno, errnoToIOError)
import qualified Data.ByteString.Lazy as L (ByteString, toChunks, fromChunks)
import qualified Data.ByteString.Lazy.Internal as L (defaultChunkSize)
import qualified Data.ByteString as S
import qualified Codec.Text.IConv.Internal as IConv
import Codec.Text.IConv.Internal (IConv)
type EncodingName = String
data Span =
Span !S.ByteString
| ConversionError !ConversionError
data ConversionError =
UnsuportedConversion EncodingName EncodingName
| InvalidChar Int
| IncompleteChar Int
| UnexpectedError C.Error.Errno
reportConversionError :: ConversionError -> IOError
reportConversionError :: ConversionError -> IOError
reportConversionError ConversionError
conversionError = case ConversionError
conversionError of
UnsuportedConversion EncodingName
fromEncoding EncodingName
toEncoding
-> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ EncodingName
"cannot convert from string encoding "
EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName -> EncodingName
forall a. Show a => a -> EncodingName
show EncodingName
fromEncoding EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName
" to string encoding "
EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName -> EncodingName
forall a. Show a => a -> EncodingName
show EncodingName
toEncoding
InvalidChar Int
inputPos -> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ EncodingName
"invalid input sequence at byte offset "
EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ Int -> EncodingName
forall a. Show a => a -> EncodingName
show Int
inputPos
IncompleteChar Int
inputPos -> EncodingName -> IOError
err (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ EncodingName
"incomplete input sequence at byte offset "
EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ Int -> EncodingName
forall a. Show a => a -> EncodingName
show Int
inputPos
UnexpectedError Errno
errno -> EncodingName
-> Errno -> Maybe Handle -> Maybe EncodingName -> IOError
C.Error.errnoToIOError
EncodingName
"Codec.Text.IConv: unexpected error" Errno
errno
Maybe Handle
forall a. Maybe a
Nothing Maybe EncodingName
forall a. Maybe a
Nothing
where err :: EncodingName -> IOError
err EncodingName
msg = EncodingName -> IOError
userError (EncodingName -> IOError) -> EncodingName -> IOError
forall a b. (a -> b) -> a -> b
$ EncodingName
"Codec.Text.IConv: " EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName
msg
{-# NOINLINE convert #-}
convert :: EncodingName
-> EncodingName
-> L.ByteString
-> L.ByteString
convert :: EncodingName -> EncodingName -> ByteString -> ByteString
convert EncodingName
fromEncoding EncodingName
toEncoding =
[StrictByteString] -> ByteString
L.fromChunks
([StrictByteString] -> ByteString)
-> (ByteString -> [StrictByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [StrictByteString] -> [StrictByteString])
-> [StrictByteString] -> [Span] -> [StrictByteString]
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Span -> [StrictByteString] -> [StrictByteString]
span []
([Span] -> [StrictByteString])
-> (ByteString -> [Span]) -> ByteString -> [StrictByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily EncodingName
fromEncoding EncodingName
toEncoding
where
span :: Span -> [StrictByteString] -> [StrictByteString]
span (Span StrictByteString
c) [StrictByteString]
cs = StrictByteString
c StrictByteString -> [StrictByteString] -> [StrictByteString]
forall a. a -> [a] -> [a]
: [StrictByteString]
cs
span (ConversionError ConversionError
e) [StrictByteString]
_ =
#if MIN_VERSION_base(4,0,0)
IOError -> [StrictByteString]
forall a e. (?callStack::CallStack, Exception e) => e -> a
Exception.throw (ConversionError -> IOError
reportConversionError ConversionError
e)
#else
Exception.throw (Exception.IOException (reportConversionError e))
#endif
data Fuzzy = Transliterate | Discard
convertFuzzy :: Fuzzy
-> EncodingName
-> EncodingName
-> L.ByteString
-> L.ByteString
convertFuzzy :: Fuzzy -> EncodingName -> EncodingName -> ByteString -> ByteString
convertFuzzy Fuzzy
fuzzy EncodingName
fromEncoding EncodingName
toEncoding =
[StrictByteString] -> ByteString
L.fromChunks
([StrictByteString] -> ByteString)
-> (ByteString -> [StrictByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Span -> [StrictByteString] -> [StrictByteString])
-> [StrictByteString] -> [Span] -> [StrictByteString]
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Span -> [StrictByteString] -> [StrictByteString]
span []
([Span] -> [StrictByteString])
-> (ByteString -> [Span]) -> ByteString -> [StrictByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal InvalidCharBehaviour
IgnoreInvalidChar EncodingName
fromEncoding (EncodingName
toEncoding EncodingName -> EncodingName -> EncodingName
forall a. [a] -> [a] -> [a]
++ EncodingName
mode)
where
mode :: EncodingName
mode = case Fuzzy
fuzzy of
Fuzzy
Transliterate -> EncodingName
"//IGNORE,TRANSLIT"
Fuzzy
Discard -> EncodingName
"//IGNORE"
span :: Span -> [StrictByteString] -> [StrictByteString]
span (Span StrictByteString
c) [StrictByteString]
cs = StrictByteString
c StrictByteString -> [StrictByteString] -> [StrictByteString]
forall a. a -> [a] -> [a]
: [StrictByteString]
cs
span (ConversionError ConversionError
_) [StrictByteString]
cs = [StrictByteString]
cs
{-# NOINLINE convertStrictly #-}
convertStrictly :: EncodingName
-> EncodingName
-> L.ByteString
-> Either L.ByteString
ConversionError
convertStrictly :: EncodingName
-> EncodingName -> ByteString -> Either ByteString ConversionError
convertStrictly EncodingName
fromEncoding EncodingName
toEncoding =
[StrictByteString] -> [Span] -> Either ByteString ConversionError
strictify []
([Span] -> Either ByteString ConversionError)
-> (ByteString -> [Span])
-> ByteString
-> Either ByteString ConversionError
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily EncodingName
fromEncoding EncodingName
toEncoding
where
strictify :: [S.ByteString] -> [Span] -> Either L.ByteString ConversionError
strictify :: [StrictByteString] -> [Span] -> Either ByteString ConversionError
strictify [StrictByteString]
cs [] = ByteString -> Either ByteString ConversionError
forall a b. a -> Either a b
Left ([StrictByteString] -> ByteString
L.fromChunks ([StrictByteString] -> [StrictByteString]
forall a. [a] -> [a]
reverse [StrictByteString]
cs))
strictify [StrictByteString]
cs (Span StrictByteString
c : [Span]
ss) = [StrictByteString] -> [Span] -> Either ByteString ConversionError
strictify (StrictByteString
cStrictByteString -> [StrictByteString] -> [StrictByteString]
forall a. a -> [a] -> [a]
:[StrictByteString]
cs) [Span]
ss
strictify [StrictByteString]
_ (ConversionError ConversionError
e:[Span]
_) = ConversionError -> Either ByteString ConversionError
forall a b. b -> Either a b
Right ConversionError
e
{-# NOINLINE convertLazily #-}
convertLazily :: EncodingName
-> EncodingName
-> L.ByteString
-> [Span]
convertLazily :: EncodingName -> EncodingName -> ByteString -> [Span]
convertLazily = InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal InvalidCharBehaviour
StopOnInvalidChar
data InvalidCharBehaviour = StopOnInvalidChar | IgnoreInvalidChar
convertInternal :: InvalidCharBehaviour
-> EncodingName -> EncodingName
-> L.ByteString -> [Span]
convertInternal :: InvalidCharBehaviour
-> EncodingName -> EncodingName -> ByteString -> [Span]
convertInternal InvalidCharBehaviour
ignore EncodingName
fromEncoding EncodingName
toEncoding ByteString
input =
EncodingName
-> EncodingName -> (InitStatus -> IConv [Span]) -> [Span]
forall a.
EncodingName -> EncodingName -> (InitStatus -> IConv a) -> a
IConv.run EncodingName
fromEncoding EncodingName
toEncoding ((InitStatus -> IConv [Span]) -> [Span])
-> (InitStatus -> IConv [Span]) -> [Span]
forall a b. (a -> b) -> a -> b
$ \InitStatus
status -> case InitStatus
status of
InitStatus
IConv.InitOk -> do Int -> IConv ()
IConv.newOutputBuffer Int
outChunkSize
InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore (ByteString -> [StrictByteString]
L.toChunks ByteString
input)
InitStatus
IConv.UnsupportedConversion -> ConversionError -> IConv [Span]
failConversion (EncodingName -> EncodingName -> ConversionError
UnsuportedConversion
EncodingName
fromEncoding
EncodingName
toEncoding)
IConv.UnexpectedInitError Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)
fillInputBuffer :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fillInputBuffer :: InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore (StrictByteString
inChunk : [StrictByteString]
inChunks) = do
StrictByteString -> IConv ()
IConv.pushInputBuffer StrictByteString
inChunk
InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [StrictByteString]
inChunks
fillInputBuffer InvalidCharBehaviour
_ignore [] = do
outputBufferBytesAvailable <- IConv Int
IConv.outputBufferBytesAvailable
IConv.finalise
if outputBufferBytesAvailable > 0
then do outChunk <- IConv.popOutputBuffer
return [Span outChunk]
else return []
drainBuffers :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
drainBuffers :: InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore [StrictByteString]
inChunks = do
inputBufferEmpty_ <- IConv Bool
IConv.inputBufferEmpty
outputBufferFull <- IConv.outputBufferFull
assert (not outputBufferFull && not inputBufferEmpty_) $ return ()
status <- IConv.iconv
case status of
Status
IConv.InputEmpty -> do
inputBufferEmpty <- IConv Bool
IConv.inputBufferEmpty
assert inputBufferEmpty $ fillInputBuffer ignore inChunks
Status
IConv.OutputFull -> do
outChunk <- IConv StrictByteString
IConv.popOutputBuffer
outChunks <- IConv.unsafeInterleave $ do
IConv.newOutputBuffer outChunkSize
drainBuffers ignore inChunks
return (Span outChunk : outChunks)
Status
IConv.InvalidChar -> InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
invalidChar InvalidCharBehaviour
ignore [StrictByteString]
inChunks
Status
IConv.IncompleteChar -> InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
fixupBoundary InvalidCharBehaviour
ignore [StrictByteString]
inChunks
IConv.UnexpectedError Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)
fixupBoundary :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fixupBoundary :: InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
fixupBoundary InvalidCharBehaviour
_ignore [] = do
inputPos <- IConv Int
IConv.inputPosition
failConversion (IncompleteChar inputPos)
fixupBoundary InvalidCharBehaviour
ignore inChunks :: [StrictByteString]
inChunks@(StrictByteString
inChunk : [StrictByteString]
inChunks') = do
inSize <- IConv Int
IConv.inputBufferSize
assert (inSize < tmpChunkSize) $ return ()
let extraBytes = Int
tmpChunkSize Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
inSize
if S.length inChunk <= extraBytes
then do
IConv.replaceInputBuffer (`S.append` inChunk)
drainBuffers ignore inChunks'
else do
IConv.replaceInputBuffer (`S.append` S.take extraBytes inChunk)
before <- IConv.inputBufferSize
assert (before == tmpChunkSize) $ return ()
status <- IConv.iconv
after <- IConv.inputBufferSize
let consumed = Int
before Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
after
case status of
Status
IConv.InputEmpty ->
Bool -> IConv [Span] -> IConv [Span]
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
consumed Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
tmpChunkSize) (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$
InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
fillInputBuffer InvalidCharBehaviour
ignore (Int -> StrictByteString -> StrictByteString
S.drop Int
extraBytes StrictByteString
inChunk StrictByteString -> [StrictByteString] -> [StrictByteString]
forall a. a -> [a] -> [a]
: [StrictByteString]
inChunks')
Status
IConv.OutputFull -> do
outChunk <- IConv StrictByteString
IConv.popOutputBuffer
outChunks <- IConv.unsafeInterleave $ do
IConv.newOutputBuffer outChunkSize
drainBuffers ignore inChunks
return (Span outChunk : outChunks)
Status
IConv.InvalidChar -> InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
invalidChar InvalidCharBehaviour
ignore [StrictByteString]
inChunks
Status
IConv.IncompleteChar ->
Bool -> IConv [Span] -> IConv [Span]
forall a. (?callStack::CallStack) => Bool -> a -> a
assert (Int
inSize Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
consumed Bool -> Bool -> Bool
&& Int
consumed Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
tmpChunkSize) (IConv [Span] -> IConv [Span]) -> IConv [Span] -> IConv [Span]
forall a b. (a -> b) -> a -> b
$
InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
drainBuffers InvalidCharBehaviour
ignore (Int -> StrictByteString -> StrictByteString
S.drop (Int
consumed Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
inSize) StrictByteString
inChunk StrictByteString -> [StrictByteString] -> [StrictByteString]
forall a. a -> [a] -> [a]
: [StrictByteString]
inChunks')
IConv.UnexpectedError Errno
errno -> ConversionError -> IConv [Span]
failConversion (Errno -> ConversionError
UnexpectedError Errno
errno)
invalidChar :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
invalidChar :: InvalidCharBehaviour -> [StrictByteString] -> IConv [Span]
invalidChar InvalidCharBehaviour
StopOnInvalidChar [StrictByteString]
_ = do
inputPos <- IConv Int
IConv.inputPosition
failConversion (InvalidChar inputPos)
invalidChar InvalidCharBehaviour
IgnoreInvalidChar [StrictByteString]
inChunks = do
inputPos <- IConv Int
IConv.inputPosition
let invalidCharError = ConversionError -> Span
ConversionError (Int -> ConversionError
InvalidChar Int
inputPos)
outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable
if outputBufferBytesAvailable > 0
then do outChunk <- IConv.popOutputBuffer
outChunks <- IConv.unsafeInterleave $ do
IConv.newOutputBuffer outChunkSize
inputBufferEmpty <- IConv.inputBufferEmpty
if inputBufferEmpty
then fillInputBuffer IgnoreInvalidChar inChunks
else drainBuffers IgnoreInvalidChar inChunks
return (Span outChunk : invalidCharError : outChunks)
else do outChunks <- IConv.unsafeInterleave $ do
IConv.newOutputBuffer outChunkSize
inputBufferEmpty <- IConv.inputBufferEmpty
if inputBufferEmpty
then fillInputBuffer IgnoreInvalidChar inChunks
else drainBuffers IgnoreInvalidChar inChunks
return (invalidCharError : outChunks)
failConversion :: ConversionError -> IConv [Span]
failConversion :: ConversionError -> IConv [Span]
failConversion ConversionError
err = do
outputBufferBytesAvailable <- IConv Int
IConv.outputBufferBytesAvailable
IConv.finalise
if outputBufferBytesAvailable > 0
then do outChunk <- IConv.popOutputBuffer
return [Span outChunk, ConversionError err]
else return [ ConversionError err]
outChunkSize :: Int
outChunkSize :: Int
outChunkSize = Int
L.defaultChunkSize
tmpChunkSize :: Int
tmpChunkSize :: Int
tmpChunkSize = Int
16