{-# LINE 1 "Data/Text/ICU/CharsetDetection.hsc" #-}
{-# LANGUAGE ForeignFunctionInterface #-}
module Data.Text.ICU.CharsetDetection
(
setText
, detect
, mkCharsetDetector
, withCharsetDetector
, wrapUCharsetMatch
, CharsetMatch
, CharsetDetector
, getConfidence
, getName
, getLanguage
) where
import Foreign.Ptr (Ptr)
import Foreign.C.String (CString)
import Foreign.C.Types (CChar)
import qualified Data.ByteString as BS
import Data.ByteString (ByteString)
import qualified Data.Text.Encoding as TE
import Data.Text (Text)
import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
import Data.Text.ICU.CharsetDetection.Internal (UCharsetMatch, UCharsetDetector,
CharsetDetector, CharsetMatch,
mkCharsetDetector,
withCharsetDetector,
withCharsetMatch,
wrapUCharsetMatch)
setText :: ByteString -> CharsetDetector -> IO ()
setText :: ByteString -> CharsetDetector -> IO ()
setText ByteString
bs CharsetDetector
ucsd = CharsetDetector -> (Ptr UCharsetDetector -> IO ()) -> IO ()
forall a. CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a
withCharsetDetector CharsetDetector
ucsd Ptr UCharsetDetector -> IO ()
go
where
go :: Ptr UCharsetDetector -> IO ()
go Ptr UCharsetDetector
u = if ByteString -> Int
BS.length ByteString
bs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
512
then ByteString -> (CStringLen -> IO ()) -> IO ()
forall a. ByteString -> (CStringLen -> IO a) -> IO a
BS.useAsCStringLen ByteString
bs (\(Ptr CChar
text,Int
size) -> (Ptr UErrorCode -> IO ()) -> IO ()
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO ()) -> IO ())
-> (Ptr UErrorCode -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
ucsdet_setText Ptr UCharsetDetector
u Ptr CChar
text Int
size)
else ByteString -> (CStringLen -> IO ()) -> IO ()
forall a. ByteString -> (CStringLen -> IO a) -> IO a
BS.useAsCStringLen (Int -> ByteString -> ByteString
BS.take Int
512 ByteString
bs) (\(Ptr CChar
text,Int
size) -> (Ptr UErrorCode -> IO ()) -> IO ()
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO ()) -> IO ())
-> (Ptr UErrorCode -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
ucsdet_setText Ptr UCharsetDetector
u Ptr CChar
text Int
size)
detect :: ByteString -> IO CharsetMatch
detect :: ByteString -> IO CharsetMatch
detect ByteString
bs = do
ucsd <- IO CharsetDetector
mkCharsetDetector
setText bs ucsd
wrapUCharsetMatch ucsd $ withCharsetDetector ucsd (handleError . ucsdet_detect)
getConfidence :: CharsetMatch -> IO Int
getConfidence :: CharsetMatch -> IO Int
getConfidence CharsetMatch
ucm = CharsetMatch -> (Ptr UCharsetMatch -> IO Int) -> IO Int
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucm ((Ptr UCharsetMatch -> IO Int) -> IO Int)
-> (Ptr UCharsetMatch -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$ (Ptr UErrorCode -> IO Int) -> IO Int
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO Int) -> IO Int)
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int)
-> Ptr UCharsetMatch
-> IO Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int
ucsdet_getConfidence
getName :: CharsetMatch -> IO Text
getName :: CharsetMatch -> IO Text
getName CharsetMatch
ucsm = do
bs <- CharsetMatch
-> (Ptr UCharsetMatch -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucsm ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar))
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar))
-> Ptr UCharsetMatch
-> IO (Ptr CChar)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar)
ucsdet_getName) IO (Ptr CChar) -> (Ptr CChar -> IO ByteString) -> IO ByteString
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Ptr CChar -> IO ByteString
BS.packCString
return $ TE.decodeUtf8 bs
getLanguage :: CharsetMatch -> IO Text
getLanguage :: CharsetMatch -> IO Text
getLanguage CharsetMatch
ucsm = do
bs <- CharsetMatch
-> (Ptr UCharsetMatch -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucsm ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar))
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar))
-> Ptr UCharsetMatch
-> IO (Ptr CChar)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar)
ucsdet_getLanguage) IO (Ptr CChar) -> (Ptr CChar -> IO ByteString) -> IO ByteString
forall a b. IO a -> (a -> IO b) -> IO b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Ptr CChar -> IO ByteString
BS.packCString
return $ TE.decodeUtf8 bs
foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_setText" ucsdet_setText
:: Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_detect" ucsdet_detect
:: Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch)
foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getConfidence" ucsdet_getConfidence
:: Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int
foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getName" ucsdet_getName
:: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString
foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getLanguage" ucsdet_getLanguage
:: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString