{-# LANGUAGE Rank2Types #-} -- Random and Binary IO with IterateeM -- A general-purpose TIFF library -- The library gives the user the TIFF dictionary, which the user -- can search for specific tags and obtain the values associated with -- the tags, including the pixel matrix. -- -- The overarching theme is incremental processing: initially, -- only the TIFF dictionary is read. The value associated with a tag -- is read only when that tag is looked up (unless the value was short -- and was packed in the TIFF dictionary entry). The pixel matrix -- (let alone the whole TIFF file) is not loaded in memory -- -- the pixel matrix is not even located before it is needed. -- The matrix is processed incrementally, by a user-supplied -- iteratee. -- -- The incremental processing is accomplished by iteratees and enumerators. -- The enumerators are indeed first-class, they are stored -- in the interned TIFF dictionary data structure. These enumerators -- represent the values associated with tags; the values will be read -- on demand, when the enumerator is applied to a user-given iteratee. -- -- The library extensively uses nested streams, tacitly converting the -- stream of raw bytes from the file into streams of integers, -- rationals and other user-friendly items. The pixel matrix is -- presented as a contiguous stream, regardless of its segmentation -- into strips and physical arrangement. -- The library exhibits random IO and binary parsing, reading -- of multi-byte numeric data in big- or little-endian formats. -- The library can be easily adopted for AIFF, RIFF and other -- IFF formats. -- -- This TIFF library is to be contrasted with the corresponding Scheme -- code: -- http://okmij.org/ftp/Scheme/binary-io.html#tiff -- The main distinction is using iteratees for on-demand processing. module Tiff ( tiff_reader -- The main function: load the dict , TIFFDict , TIFFException(..) -- A few helpers for getting data from TIFF dictionary , dict_read_int , dict_read_ints , dict_read_rat , dict_read_string , pixel_matrix_enum -- reading the pixel matrix -- TIFF tags and types , TIFF_TYPE(..) , TIFF_TAG(..) , tag_to_int , int_to_tag ) where import IterateeM import RandomIO import Control.Monad.Trans import Data.Char (chr) import Data.Int import Data.Word import Data.Ratio import Data.Bits import Prelude hiding (head, drop, dropWhile, take, break, catch) import qualified Prelude import qualified Data.IntMap as IM import Data.Typeable import Control.Exception -- A TIFF directory is a finite map associating a TIFF tag with -- a record TIFFDE type TIFFDict = IM.IntMap TIFFDE data TIFFDE = TIFFDE{tiffde_count :: Int, -- number of items tiffde_enum :: TIFFDE_ENUM -- enumerator to get values } data TIFFDE_ENUM = TEN_CHAR (forall a. Enumeratee Word8 Char EIO a) | TEN_BYTE (forall a. Enumeratee Word8 Word8 EIO a) | TEN_INT (forall a. Enumeratee Word8 Integer EIO a) | TEN_RAT (forall a. Enumeratee Word8 Rational EIO a) -- Standard TIFF data types data TIFF_TYPE = TT_NONE -- 0 | TT_byte -- 1 8-bit unsigned integer | TT_ascii -- 2 8-bit bytes with last byte null | TT_short -- 3 16-bit unsigned integer | TT_long -- 4 32-bit unsigned integer | TT_rational -- 5 64-bit fractional (numer+denominator) -- The following was added in TIFF 6.0 | TT_sbyte -- 6 8-bit signed (2s-complement) integer | TT_undefined -- 7 An 8-bit byte, "8-bit chunk" | TT_sshort -- 8 16-bit signed (2s-complement) integer | TT_slong -- 9 32-bit signed (2s-complement) integer | TT_srational -- 10 "signed rational", two SLONGs (num+denominator) | TT_float -- 11 "IEEE 32-bit float", single precision (4-byte) | TT_double -- 12 "IEEE 64-bit double", double precision (8-byte) deriving (Eq, Enum, Ord, Bounded, Show) -- Standard TIFF tags data TIFF_TAG = TG_other Int -- other than below | TG_SUBFILETYPE -- subfile data descriptor | TG_OSUBFILETYPE -- +kind of data in subfile | TG_IMAGEWIDTH -- image width in pixels | TG_IMAGELENGTH -- image height in pixels | TG_BITSPERSAMPLE -- bits per channel (sample) | TG_COMPRESSION -- data compression technique | TG_PHOTOMETRIC -- photometric interpretation | TG_THRESHOLDING -- +thresholding used on data | TG_CELLWIDTH -- +dithering matrix width | TG_CELLLENGTH -- +dithering matrix height | TG_FILLORDER -- +data order within a byte | TG_DOCUMENTNAME -- name of doc. image is from | TG_IMAGEDESCRIPTION -- info about image | TG_MAKE -- scanner manufacturer name | TG_MODEL -- scanner model name/number | TG_STRIPOFFSETS -- offsets to data strips | TG_ORIENTATION -- +image orientation | TG_SAMPLESPERPIXEL -- samples per pixel | TG_ROWSPERSTRIP -- rows per strip of data | TG_STRIPBYTECOUNTS -- bytes counts for strips | TG_MINSAMPLEVALUE -- +minimum sample value | TG_MAXSAMPLEVALUE -- maximum sample value | TG_XRESOLUTION -- pixels/resolution in x | TG_YRESOLUTION -- pixels/resolution in y | TG_PLANARCONFIG -- storage organization | TG_PAGENAME -- page name image is from | TG_XPOSITION -- x page offset of image lhs | TG_YPOSITION -- y page offset of image lhs | TG_FREEOFFSETS -- +byte offset to free block | TG_FREEBYTECOUNTS -- +sizes of free blocks | TG_GRAYRESPONSEUNIT -- gray scale curve accuracy | TG_GRAYRESPONSECURVE -- gray scale response curve | TG_GROUP3OPTIONS -- 32 flag bits | TG_GROUP4OPTIONS -- 32 flag bits | TG_RESOLUTIONUNIT -- units of resolutions | TG_PAGENUMBER -- page numbers of multi-page | TG_COLORRESPONSEUNIT -- color scale curve accuracy | TG_COLORRESPONSECURVE -- RGB response curve | TG_SOFTWARE -- name & release | TG_DATETIME -- creation date and time | TG_ARTIST -- creator of image | TG_HOSTCOMPUTER -- machine where created | TG_PREDICTOR -- prediction scheme w/ LZW | TG_WHITEPOINT -- image white point | TG_PRIMARYCHROMATICITIES -- primary chromaticities | TG_COLORMAP -- RGB map for pallette image | TG_BADFAXLINES -- lines w/ wrong pixel count | TG_CLEANFAXDATA -- regenerated line info | TG_CONSECUTIVEBADFAXLINES -- max consecutive bad lines | TG_MATTEING -- alpha channel is present deriving (Eq, Show) tag_map = [ (TG_SUBFILETYPE,254), (TG_OSUBFILETYPE,255), (TG_IMAGEWIDTH,256), (TG_IMAGELENGTH,257), (TG_BITSPERSAMPLE,258), (TG_COMPRESSION,259), (TG_PHOTOMETRIC,262), (TG_THRESHOLDING,263), (TG_CELLWIDTH,264), (TG_CELLLENGTH,265), (TG_FILLORDER,266), (TG_DOCUMENTNAME,269), (TG_IMAGEDESCRIPTION,270), (TG_MAKE,271), (TG_MODEL,272), (TG_STRIPOFFSETS,273), (TG_ORIENTATION,274), (TG_SAMPLESPERPIXEL,277), (TG_ROWSPERSTRIP,278), (TG_STRIPBYTECOUNTS,279), (TG_MINSAMPLEVALUE,280), (TG_MAXSAMPLEVALUE,281), (TG_XRESOLUTION,282), (TG_YRESOLUTION,283), (TG_PLANARCONFIG,284), (TG_PAGENAME,285), (TG_XPOSITION,286), (TG_YPOSITION,287), (TG_FREEOFFSETS,288), (TG_FREEBYTECOUNTS,289), (TG_GRAYRESPONSEUNIT,290), (TG_GRAYRESPONSECURVE,291), (TG_GROUP3OPTIONS,292), (TG_GROUP4OPTIONS,293), (TG_RESOLUTIONUNIT,296), (TG_PAGENUMBER,297), (TG_COLORRESPONSEUNIT,300), (TG_COLORRESPONSECURVE,301), (TG_SOFTWARE,305), (TG_DATETIME,306), (TG_ARTIST,315), (TG_HOSTCOMPUTER,316), (TG_PREDICTOR,317), (TG_WHITEPOINT,318), (TG_PRIMARYCHROMATICITIES,319), (TG_COLORMAP,320), (TG_BADFAXLINES,326), (TG_CLEANFAXDATA,327), (TG_CONSECUTIVEBADFAXLINES,328), (TG_MATTEING,32995) ] tag_map' = IM.fromList $ map (\(tag,v) -> (v,tag)) tag_map tag_to_int :: TIFF_TAG -> Int tag_to_int (TG_other x) = x tag_to_int x = maybe (error $ "not found tag: " ++ show x) id $ lookup x tag_map int_to_tag :: Int -> TIFF_TAG int_to_tag x = maybe (TG_other x) id $ IM.lookup x tag_map' data TIFFException = TIFFException String deriving Show instance Typeable TIFFException where typeOf _ = mkTyConApp (mkTyCon "TIFFException") [] instance Exception TIFFException throw_err :: Monad m => String -> Iteratee el m a throw_err str = throwErr (toException (TIFFException str)) -- The library function to read the TIFF dictionary tiff_reader :: Iteratee Word8 EIO TIFFDict tiff_reader = do read_magic check_version dict_offset <- endian_read4 seek_stream (fromIntegral dict_offset) load_dict where -- Read the magic and set the endianness read_magic = do c1 <- head c2 <- head case (c1,c2) of (0x4d, 0x4d) -> lift $ eio_set_endian Big_endian -- MM magic (0x49, 0x49) -> lift $ eio_set_endian Little_endian -- II magic _ -> throw_err $ "Bad TIFF magic word: " ++ show [c1,c2] -- Check the version in the header. It is always ... tiff_version = 42 check_version = do v <- endian_read2 if v == tiff_version then return () else throw_err $ "Bad TIFF version: " ++ show v -- A few conversion procedures u32_to_float :: Word32 -> Double u32_to_float x = -- unsigned 32-bit int -> IEEE float error "u32->float is not yet implemented" u32_to_s32 :: Word32 -> Int32 -- unsigned 32-bit int -> signed 32 bit u32_to_s32 = fromIntegral -- u32_to_s32 0x7fffffff == 0x7fffffff -- u32_to_s32 0xffffffff == -1 u16_to_s16 :: Word16 -> Int16 -- unsigned 16-bit int -> signed 16 bit u16_to_s16 = fromIntegral -- u16_to_s16 32767 == 32767 -- u16_to_s16 32768 == -32768 -- u16_to_s16 65535 == -1 u8_to_s8 :: Word8 -> Int8 -- unsigned 8-bit int -> signed 8 bit u8_to_s8 = fromIntegral -- u8_to_s8 127 == 127 -- u8_to_s8 128 == -128 -- u8_to_s8 255 == -1 note :: [String] -> Iteratee el EIO () note = liftIO . putStrLn . concat -- An internal function to load the dictionary. It assumes that the stream -- is positioned to read the dictionary load_dict :: Iteratee Word8 EIO TIFFDict load_dict = do nentries <- endian_read2 dict <- foldr (const read_entry) (return IM.empty) [1..nentries] next_dict <- endian_read4 if next_dict > 0 then note ["The TIFF file contains several images, ", "only the first one will be considered"] else return () return dict where read_entry dictM = do dict <- dictM tag <- endian_read2 typ <- endian_read2 >>= convert_type . fromIntegral count <- endian_read4 -- we read the val-offset later. We need to check the size and the type -- of the datum, because val-offset may contain the value itself, -- in its lower-numbered bytes, regardless of the big/little endian -- order! note ["TIFFEntry: tag ",show . int_to_tag . fromIntegral $ tag, " type ", show typ, " count ", show count] enum <- read_value typ (fromIntegral count) case enum of Just enum -> return $ IM.insert (fromIntegral tag) (TIFFDE (fromIntegral count) enum) dict _ -> return dict convert_type :: Monad m => Int -> Iteratee el m TIFF_TYPE convert_type typ | typ > 0 && typ <= fromEnum (maxBound::TIFF_TYPE) = return . toEnum $ typ convert_type typ = throw_err $ "Bad type of entry: " ++ show typ read_value :: TIFF_TYPE -> Int -> Iteratee Word8 EIO (Maybe TIFFDE_ENUM) read_value typ 0 = do offset <- endian_read4 throw_err $ "Zero count in the entry of type: " ++ show typ return Nothing -- Read an ascii string from the offset in the -- dictionary. The last byte of -- an ascii string is always zero, which is -- included in 'count' but we don't need to read it -- Furthermore, if iter_char finishes early, -- we stop further reading read_value TT_ascii count | count > 4 = do -- for sure, val-offset is offset offset <- endian_read4 return . Just $ TEN_CHAR (\iter_char -> do seek_stream (fromIntegral offset) runI =<< takeR (pred count) (map_stream (chr . fromIntegral) iter_char)) -- Read the string of 0 to 3 characters long -- The zero terminator is included in count, but -- we don't need to read it read_value TT_ascii count = do -- count is within 1..4 let len = pred count -- string length str <- mapM (const head) [1..len] -- Always tries to read len elem drop (4-len) return . Just $ TEN_CHAR (immed_value $ map (chr . fromIntegral) str) -- Read the array of signed or unsigned bytes read_value typ count | count > 4 && typ == TT_byte || typ == TT_sbyte = do offset <- endian_read4 return . Just $ TEN_INT (\iter_int -> do seek_stream (fromIntegral offset) runI =<< takeR (pred count) (map_stream (conv_byte typ) iter_int)) -- Read the array of 1 to 4 bytes read_value typ count | typ == TT_byte || typ == TT_sbyte = do let loop acc 0 = return . reverse $ acc loop acc n = head >>= \v -> loop ((conv_byte typ $ v):acc) (pred n) str <- loop [] count drop (4-count) return . Just $ TEN_INT (immed_value str) -- Read the array of Word8 read_value TT_undefined count | count > 4 = do offset <- endian_read4 return . Just $ TEN_BYTE (\iter -> do seek_stream (fromIntegral offset) takeR count iter) -- Read the array of Word8 of 1..4 elements, -- packed in the offset field read_value TT_undefined count = do str <- sequence $ replicate count head drop (4-count) return . Just $ TEN_BYTE (immed_value str) -- Read the array of short integers -- of 1 element: the offset field contains the value read_value typ 1 | typ == TT_short || typ == TT_sshort = do item <- endian_read2 drop 2 -- skip the padding return . Just $ TEN_INT (immed_value [conv_short typ item]) -- of 2 elements: the offset field contains the value read_value typ 2 | typ == TT_short || typ == TT_sshort = do i1 <- endian_read2 i2 <- endian_read2 return . Just $ TEN_INT (immed_value [conv_short typ i1, conv_short typ i2]) -- of n elements read_value typ count | typ == TT_short || typ == TT_sshort = do offset <- endian_read4 let converter = endian_read2 >>= return . conv_short typ return . Just $ TEN_INT (\iter_int -> do seek_stream (fromIntegral offset) runI =<< takeR (2*count) (sequence_stream converter iter_int)) -- Read the array of long integers -- of 1 element: the offset field contains the value read_value typ 1 | typ == TT_long || typ == TT_slong = do item <- endian_read4 return . Just $ TEN_INT (immed_value [conv_long typ item]) -- of n elements read_value typ count | typ == TT_long || typ == TT_slong = do offset <- endian_read4 let converter = endian_read4 >>= return . conv_long typ return . Just $ TEN_INT (\iter_int -> do seek_stream (fromIntegral offset) runI =<< takeR (4*count) (sequence_stream converter iter_int)) -- Read the array of rationals. A rational can't -- be packed into the offset field read_value typ count | typ == TT_rational || typ == TT_srational = do offset <- endian_read4 let converter = do i1 <- endian_read4 i2 <- endian_read4 return $ conv_rat typ i1 i2 return . Just $ TEN_RAT (\iter_rat -> do seek_stream (fromIntegral offset) runI =<< takeR (8*count) (sequence_stream converter iter_rat)) read_value typ count = do -- stub offset <- endian_read4 note ["unhandled type: ", show typ, " with count ", show count] return Nothing immed_value :: [el] -> Enumeratee Word8 el EIO a immed_value item iter = lift $ enum_pure_1chunk item iter conv_byte :: TIFF_TYPE -> Word8 -> Integer conv_byte TT_byte = fromIntegral conv_byte TT_sbyte = fromIntegral . u8_to_s8 conv_short :: TIFF_TYPE -> Word16 -> Integer conv_short TT_short = fromIntegral conv_short TT_sshort = fromIntegral . u16_to_s16 conv_long :: TIFF_TYPE -> Word32 -> Integer conv_long TT_long = fromIntegral conv_long TT_slong = fromIntegral . u32_to_s32 conv_rat :: TIFF_TYPE -> Word32 -> Word32 -> Rational conv_rat TT_rational v1 v2 = (fromIntegral v1) % (fromIntegral v2) conv_rat TT_srational v1 v2 = (fromIntegral (u32_to_s32 v1)) % (fromIntegral (u32_to_s32 v2)) -- Reading the pixel matrix -- For simplicity, we assume no compression and 8-bit pixels pixel_matrix_enum :: TIFFDict -> Enumeratee Word8 Word8 EIO a pixel_matrix_enum dict iter = validate_dict >>= proceed where -- Make sure we can handle this particular TIFF image validate_dict = do dict_assert TG_COMPRESSION 1 dict_assert TG_SAMPLESPERPIXEL 1 dict_assert TG_BITSPERSAMPLE 8 Just ncols <- dict_read_int TG_IMAGEWIDTH dict Just nrows <- dict_read_int TG_IMAGELENGTH dict Just strip_offsets <- dict_read_ints TG_STRIPOFFSETS dict rps <- dict_read_int TG_ROWSPERSTRIP dict >>= return . maybe nrows id if ncols > 0 && nrows > 0 && rps > 0 then return $ (ncols,nrows,rps,strip_offsets) else throw_err "Problematic TIFF image" dict_assert tag v = do vfound <- dict_read_int tag dict case vfound of Just v' | v' == v -> return () _ -> throw_err (unwords ["dict_assert: tag:", show tag, "expected:", show v, "found:", show vfound]) proceed (ncols,nrows,rows_per_strip,strip_offsets) = do let strip_size = rows_per_strip * ncols image_size = nrows * ncols note ["Processing the pixel matrix, ", show image_size, " bytes"] let loop pos _ iter@IE_done{} = return iter loop pos [] iter = return iter loop pos (strip:strips) iter = do seek_stream (fromIntegral strip) let len = min strip_size (image_size - pos) iter <- takeR (fromIntegral len) iter loop (pos+len) strips iter loop 0 strip_offsets iter -- A few helpers for getting data from TIFF dictionary dict_read_int :: TIFF_TAG -> TIFFDict -> Iteratee Word8 EIO (Maybe Integer) dict_read_int tag dict = do els <- dict_read_ints tag dict case els of Just (e:_) -> return $ Just e _ -> return Nothing dict_read_ints :: TIFF_TAG -> TIFFDict -> Iteratee Word8 EIO (Maybe [Integer]) dict_read_ints tag dict = case IM.lookup (tag_to_int tag) dict of Just (TIFFDE _ (TEN_INT enum)) -> do e <- runI =<< enum stream2list return (Just e) _ -> return Nothing dict_read_rat :: TIFF_TAG -> TIFFDict -> Iteratee Word8 EIO (Maybe Rational) dict_read_rat tag dict = case IM.lookup (tag_to_int tag) dict of Just (TIFFDE 1 (TEN_RAT enum)) -> do e <- runI =<< enum head return (Just e) _ -> return Nothing dict_read_string :: TIFF_TAG -> TIFFDict -> Iteratee Word8 EIO (Maybe String) dict_read_string tag dict = case IM.lookup (tag_to_int tag) dict of Just (TIFFDE _ (TEN_CHAR enum)) -> do e <- runI =<< enum stream2list return (Just e) _ -> return Nothing