libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

riff_extractor.c (15405B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2004, 2009, 2012, 2025 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 
     20      This code was based on AVInfo 1.0 alpha 11
     21      (c) George Shuklin, gs]AT[shounen.ru, 2002-2004
     22      http://shounen.ru/soft/avinfo/
     23 
     24      and bitcollider 0.6.0
     25      (PD) 2004 The Bitzi Corporation
     26      http://bitzi.com/
     27  */
     28 /**
     29  * @file plugins/riff_extractor.c
     30  * @brief plugin to support RIFF files (AVI, ANI, and others)
     31  * @author Christian Grothoff
     32  *
     33  * RIFF structure:
     34  *   "RIFF" (4 bytes)
     35  *   file-size - 8 (4 bytes, little-endian)
     36  *   form type (4 bytes): "AVI ", "WAVE", "ANI ", etc.
     37  *   chunks...
     38  *
     39  * Each chunk:
     40  *   chunk ID (4 bytes)
     41  *   data size (4 bytes, little-endian, excludes the 8-byte header)
     42  *   data (size bytes, padded to even offset)
     43  *
     44  * LIST chunks carry a 4-byte list type immediately after their size,
     45  * followed by sub-chunks.  LIST INFO is globally standardised and
     46  * appears in all RIFF-based formats.
     47  */
     48 #include "platform.h"
     49 #include "extractor.h"
     50 #include <math.h>
     51 
     52 
     53 /**
     54  * Read a little-endian uint32 from @a data.
     55  */
     56 static uint32_t
     57 fread_le (const char *data)
     58 {
     59   uint32_t result = 0;
     60 
     61   for (unsigned int x = 0; x < 4; x++)
     62     result |= ((unsigned char) data[x]) << (x * 8);
     63   return result;
     64 }
     65 
     66 
     67 /**
     68  * Round @a num to the nearest integer (avoids depending on C99 round()).
     69  */
     70 static double
     71 round_double (double num)
     72 {
     73   return floor (num + 0.5);
     74 }
     75 
     76 
     77 /**
     78  * Emit a UTF-8 string as metadata of the given type.
     79  * Returns from the calling function if proc signals abort.
     80  */
     81 #define ADD(s, t) do { \
     82           if (0 != ec->proc (ec->cls, "riff", (t), \
     83                              EXTRACTOR_METAFORMAT_UTF8, \
     84                              "text/plain", (s), strlen (s) + 1)) \
     85           return; \
     86 } while (0)
     87 
     88 
     89 /**
     90  * Maximum bytes we read from a single LIST INFO sub-chunk value.
     91  */
     92 #define INFO_VALUE_MAX 1024
     93 
     94 /**
     95  * Maximum number of chunks we scan at each nesting level to guard
     96  * against malformed files.
     97  */
     98 #define MAX_CHUNKS 512
     99 
    100 
    101 /**
    102  * Mapping from a LIST INFO four-CC to a libextractor meta type.
    103  */
    104 struct InfoTag
    105 {
    106   char id[4];
    107   enum EXTRACTOR_MetaType type;
    108 };
    109 
    110 static const struct InfoTag INFO_TAGS[] = {
    111   { "INAM", EXTRACTOR_METATYPE_TITLE },
    112   { "IART", EXTRACTOR_METATYPE_ARTIST },
    113   { "ICOP", EXTRACTOR_METATYPE_COPYRIGHT },
    114   { "ICRD", EXTRACTOR_METATYPE_CREATION_DATE },
    115   { "IGNR", EXTRACTOR_METATYPE_GENRE },
    116   { "IKEY", EXTRACTOR_METATYPE_KEYWORDS },
    117   { "ISFT", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
    118   { "ICMT", EXTRACTOR_METATYPE_COMMENT },
    119   { "ISRC", EXTRACTOR_METATYPE_SOURCE },
    120   { "ISBJ", EXTRACTOR_METATYPE_SUBJECT },
    121   { "ITRK", EXTRACTOR_METATYPE_TRACK_NUMBER },
    122   { "IPRD", EXTRACTOR_METATYPE_ALBUM },
    123   { "ILNG", EXTRACTOR_METATYPE_LANGUAGE },
    124 };
    125 
    126 
    127 /**
    128  * Video metadata accumulated from an AVI LIST hdrl.
    129  */
    130 struct AviState
    131 {
    132   uint32_t us_per_frame;  /* dwMicroSecPerFrame from avih */
    133   uint32_t total_frames;  /* dwTotalFrames from avih */
    134   uint32_t width;         /* dwWidth from avih */
    135   uint32_t height;        /* dwHeight from avih */
    136   char codec[5];          /* fccHandler from first vids strh, NUL-terminated */
    137   int have_avih;
    138   int have_codec;
    139 };
    140 
    141 
    142 /**
    143  * Seek to @a pos, then read @a want bytes into *@a data.
    144  *
    145  * @return bytes read, or -1 on seek failure
    146  */
    147 static ssize_t
    148 seek_and_read (struct EXTRACTOR_ExtractContext *ec,
    149                uint64_t pos,
    150                void **data,
    151                size_t want)
    152 {
    153   if ((int64_t) pos !=
    154       ec->seek (ec->cls,
    155                 (int64_t) pos,
    156                 SEEK_SET))
    157     return -1;
    158   return ec->read (ec->cls,
    159                    data,
    160                    want);
    161 }
    162 
    163 
    164 /**
    165  * Parse sub-chunks of a LIST INFO chunk and emit all recognised tags.
    166  *
    167  * @param ec    extraction context
    168  * @param start file offset of the first sub-chunk (immediately after list type)
    169  * @param end   file offset one past the last byte of the enclosing LIST chunk
    170  * @return 0 to continue, 1 if proc signalled abort
    171  */
    172 static int
    173 parse_list_info (struct EXTRACTOR_ExtractContext *ec,
    174                  uint64_t start,
    175                  uint64_t end)
    176 {
    177   uint64_t pos = start;
    178 
    179   for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
    180   {
    181     void *data;
    182     ssize_t got;
    183     char id[4];
    184     uint32_t csz;
    185 
    186     got = seek_and_read (ec,
    187                          pos,
    188                          &data,
    189                          8);
    190     if (got < 8)
    191       break;
    192     memcpy (id,
    193             data,
    194             4);
    195     csz = fread_le ((const char *) data + 4);
    196     if ( (pos + 8 + csz > end) ||
    197          (pos + 8 + csz < pos) )
    198     {
    199       /* terminate if chunk overflows the list */
    200       break;
    201     }
    202     if (0 == csz)
    203     {
    204       /* skip empty chunk */
    205       pos += 8;
    206       continue;
    207     }
    208 
    209     for (unsigned int i = 0;
    210          i < sizeof (INFO_TAGS) / sizeof (INFO_TAGS[0]);
    211          i++)
    212     {
    213       size_t rlen = (csz < INFO_VALUE_MAX) ? (size_t) csz : INFO_VALUE_MAX;
    214       char buf[INFO_VALUE_MAX + 1];
    215       size_t slen;
    216 
    217       if (0 !=
    218           memcmp (id,
    219                   INFO_TAGS[i].id,
    220                   4))
    221         continue;
    222       got = seek_and_read (ec,
    223                            pos + 8,
    224                            &data,
    225                            rlen);
    226       if (got <= 0)
    227         break;
    228       slen = (size_t) got;
    229       memcpy (buf,
    230               data,
    231               slen);
    232       /* strip trailing NULs and spaces that some encoders pad with */
    233       while ( (slen > 0) &&
    234               ( ('\0' == buf[slen - 1]) ||
    235                 (' ' == buf[slen - 1]) ) )
    236         slen--;
    237       if (0 == slen)
    238         break;
    239       buf[slen] = '\0';
    240       if (0 != ec->proc (ec->cls,
    241                          "riff",
    242                          INFO_TAGS[i].type,
    243                          EXTRACTOR_METAFORMAT_UTF8,
    244                          "text/plain",
    245                          buf,
    246                          slen + 1))
    247         return 1;
    248       break;
    249     }
    250     pos += 8 + csz + (csz & 1);
    251   }
    252   return 0;
    253 }
    254 
    255 
    256 /**
    257  * Parse sub-chunks of a LIST strl; extract the codec fourcc from the
    258  * first video stream header found.
    259  *
    260  * @param ec    extraction context
    261  * @param start file offset of the first sub-chunk
    262  * @param end   file offset one past the last byte of the strl LIST
    263  * @param state AVI state to update
    264  */
    265 static void
    266 parse_strl (struct EXTRACTOR_ExtractContext *ec,
    267             uint64_t start,
    268             uint64_t end,
    269             struct AviState *state)
    270 {
    271   uint64_t pos = start;
    272 
    273   for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
    274   {
    275     void *data;
    276     ssize_t got;
    277     char id[4];
    278     uint32_t csz;
    279 
    280     got = seek_and_read (ec,
    281                          pos,
    282                          &data,
    283                          8);
    284     if (got < 8)
    285       break;
    286     memcpy (id,
    287             data,
    288             4);
    289     csz = fread_le ((const char *) data + 4);
    290 
    291     if (! state->have_codec &&
    292         (0 == memcmp (id,
    293                       "strh",
    294                       4)) &&
    295         (csz >= 8) &&
    296         (pos + 8 + csz <= end))
    297     {
    298       /* strh layout: fccType[4] fccHandler[4] ... */
    299       got = seek_and_read (ec,
    300                            pos + 8,
    301                            &data,
    302                            8);
    303       if ( (got >= 8) &&
    304            (0 == memcmp (data,
    305                          "vids",
    306                          4)))
    307       {
    308         memcpy (state->codec,
    309                 (const char *) data + 4,
    310                 4);
    311         state->codec[4] = '\0';
    312         state->have_codec = 1;
    313       }
    314     }
    315 
    316     if (0 == csz)
    317       break;
    318     pos += 8 + csz + (csz & 1);
    319   }
    320 }
    321 
    322 
    323 /**
    324  * Parse sub-chunks of LIST hdrl in an AVI file; fills @a state with
    325  * frame timing, dimensions, and the video codec.
    326  *
    327  * @param ec    extraction context
    328  * @param start file offset of the first sub-chunk (after list type "hdrl")
    329  * @param end   file offset one past the last byte of the hdrl LIST
    330  * @param state AVI state to update
    331  */
    332 static void
    333 parse_hdrl (struct EXTRACTOR_ExtractContext *ec,
    334             uint64_t start,
    335             uint64_t end,
    336             struct AviState *state)
    337 {
    338   uint64_t pos = start;
    339 
    340   for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
    341   {
    342     void *data;
    343     ssize_t got;
    344     char id[4];
    345     uint32_t csz;
    346 
    347     got = seek_and_read (ec,
    348                          pos,
    349                          &data,
    350                          8);
    351     if (got < 8)
    352       break;
    353     memcpy (id,
    354             data,
    355             4);
    356     csz = fread_le ((const char *) data + 4);
    357 
    358     if (! state->have_avih &&
    359         (0 == memcmp (id, "avih", 4)) &&
    360         (csz >= 40) &&
    361         (pos + 8 + csz <= end))
    362     {
    363       /* AVIMAINHEADER layout (all DWORDs, little-endian):
    364            [0]  dwMicroSecPerFrame
    365            [4]  dwMaxBytesPerSec
    366            [8]  dwPaddingGranularity
    367            [12] dwFlags
    368            [16] dwTotalFrames
    369            [20] dwInitialFrames
    370            [24] dwStreams
    371            [28] dwSuggestedBufferSize
    372            [32] dwWidth
    373            [36] dwHeight           */
    374       got = seek_and_read (ec,
    375                            pos + 8,
    376                            &data,
    377                            40);
    378       if (got >= 40)
    379       {
    380         const char *d = data;
    381 
    382         state->us_per_frame = fread_le (&d[0]);
    383         state->total_frames = fread_le (&d[16]);
    384         state->width        = fread_le (&d[32]);
    385         state->height       = fread_le (&d[36]);
    386         state->have_avih    = 1;
    387       }
    388     }
    389     else if ((0 == memcmp (id,
    390                            "LIST",
    391                            4)) &&
    392              (csz >= 4) &&
    393              (pos + 8 + csz <= end))
    394     {
    395       got = seek_and_read (ec,
    396                            pos + 8,
    397                            &data,
    398                            4);
    399       if ( (got >= 4) &&
    400            (0 == memcmp (data,
    401                          "strl",
    402                          4)))
    403         parse_strl (ec,
    404                     pos + 12,
    405                     pos + 8 + csz,
    406                     state);
    407     }
    408 
    409     if (0 == csz)
    410       break;
    411     pos += 8 + csz + (csz & 1);
    412   }
    413 }
    414 
    415 
    416 /**
    417  * Main entry method for the RIFF extraction plugin.
    418  * Handles any RIFF-based format; extracts LIST INFO tags universally
    419  * and AVI video stream metadata for "AVI " files.
    420  *
    421  * @param ec extraction context provided to the plugin
    422  */
    423 void
    424 EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec);
    425 
    426 void
    427 EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec)
    428 {
    429   void *data;
    430   ssize_t got;
    431   char form_type[4];
    432   uint32_t riff_size;
    433   uint64_t file_size;
    434   uint64_t riff_end;
    435   const char *mime;
    436   int is_avi;
    437   struct AviState avi;
    438 
    439   /* need at least "RIFF" + size + form type */
    440   got = ec->read (ec->cls,
    441                   &data,
    442                   12);
    443   if (got < 12)
    444     return;
    445   if (0 != memcmp (data,
    446                    "RIFF",
    447                    4))
    448     return;
    449 
    450   riff_size  = fread_le ((const char *) data + 4);
    451   memcpy (form_type,
    452           (const char *) data + 8,
    453           4);
    454 
    455   file_size = ec->get_size (ec->cls);
    456   /* riff_size counts bytes after the 8-byte RIFF header */
    457   riff_end = (uint64_t) riff_size + 8;
    458   if (riff_end > file_size)
    459     riff_end = file_size;
    460 
    461   /* map known form types to MIME strings */
    462   if (0 == memcmp (form_type,
    463                    "AVI ",
    464                    4))
    465     mime = "video/x-msvideo";
    466   else if ((0 == memcmp (form_type,
    467                          "ANI ",
    468                          4)) ||
    469            (0 == memcmp (form_type,
    470                          "ACON",
    471                          4)))
    472     mime = "application/x-navi-animation";
    473   else if (0 == memcmp (form_type,
    474                         "RMID",
    475                         4))
    476     mime = "audio/midi";
    477   else
    478     mime = NULL;   /* unknown or handled by another plugin (e.g. WAVE) */
    479 
    480   if (NULL != mime)
    481     ADD (mime,
    482          EXTRACTOR_METATYPE_MIMETYPE);
    483 
    484   is_avi = (0 == memcmp (form_type,
    485                          "AVI ",
    486                          4));
    487   memset (&avi,
    488           0,
    489           sizeof (avi));
    490 
    491   /* scan top-level chunks */
    492   {
    493     uint64_t pos = 12;
    494 
    495     for (unsigned int n = 0; (pos + 8 <= riff_end) && (n < MAX_CHUNKS); n++)
    496     {
    497       char id[4];
    498       uint32_t csz;
    499 
    500       got = seek_and_read (ec,
    501                            pos,
    502                            &data,
    503                            8);
    504       if (got < 8)
    505         break;
    506       memcpy (id,
    507               data,
    508               4);
    509       csz = fread_le ((const char *) data + 4);
    510 
    511       if (pos + 8 + (uint64_t) csz > riff_end)
    512         break;   /* chunk overflows the declared file size */
    513 
    514       if ( (0 == memcmp (id,
    515                          "LIST",
    516                          4)) &&
    517            (csz >= 4) )
    518       {
    519         char list_type[4];
    520 
    521         got = seek_and_read (ec,
    522                              pos + 8,
    523                              &data,
    524                              4);
    525         if (got >= 4)
    526         {
    527           memcpy (list_type,
    528                   data,
    529                   4);
    530 
    531           if (0 == memcmp (list_type,
    532                            "INFO",
    533                            4))
    534           {
    535             if (0 != parse_list_info (ec,
    536                                       pos + 12,
    537                                       pos + 8 + csz))
    538               return;
    539           }
    540           else if (is_avi &&
    541                    (0 == memcmp (list_type,
    542                                  "hdrl",
    543                                  4)))
    544           {
    545             parse_hdrl (ec,
    546                         pos + 12,
    547                         pos + 8 + csz,
    548                         &avi);
    549           }
    550         }
    551       }
    552 
    553       if (0 == csz)
    554         break;
    555       pos += 8 + csz + (csz & 1);
    556     }
    557   }
    558 
    559   /* emit AVI video metadata once we've scanned all chunks */
    560   if (is_avi &&
    561       avi.have_avih &&
    562       avi.have_codec &&
    563       (avi.us_per_frame > 0))
    564   {
    565     unsigned int fps =
    566       (unsigned int) round_double (1.0e6 / (double) avi.us_per_frame);
    567 
    568     if (fps > 0)
    569     {
    570       unsigned int duration =
    571         (unsigned int) round_double ((double) avi.total_frames * 1000.0
    572                                      / (double) fps);
    573       char format[256];
    574 
    575       snprintf (format,
    576                 sizeof (format),
    577                 _ ("codec: %s, %u fps, %u ms"),
    578                 avi.codec,
    579                 fps,
    580                 duration);
    581       ADD (format,
    582            EXTRACTOR_METATYPE_FORMAT);
    583       snprintf (format,
    584                 sizeof (format),
    585                 "%ux%u",
    586                 (unsigned int) avi.width,
    587                 (unsigned int) avi.height);
    588       ADD (format,
    589            EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
    590     }
    591   }
    592 }
    593 
    594 
    595 /* end of riff_extractor.c */