commit 99d6ff78212e30b8b321ac52d0da11e4da99ea94
parent 3119b616921f4976264ddf6a9fd9accb2d61e334
Author: Christian Grothoff <grothoff@gnunet.org>
Date: Fri, 22 May 2026 23:30:39 +0200
fix up riff extractor
Diffstat:
1 file changed, 511 insertions(+), 83 deletions(-)
diff --git a/src/plugins/riff_extractor.c b/src/plugins/riff_extractor.c
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- Copyright (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
+ Copyright (C) 2004, 2009, 2012, 2025 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
@@ -27,8 +27,23 @@
*/
/**
* @file plugins/riff_extractor.c
- * @brief plugin to support RIFF files (ms-video)
+ * @brief plugin to support RIFF files (AVI, ANI, and others)
* @author Christian Grothoff
+ *
+ * RIFF structure:
+ * "RIFF" (4 bytes)
+ * file-size - 8 (4 bytes, little-endian)
+ * form type (4 bytes): "AVI ", "WAVE", "ANI ", etc.
+ * chunks...
+ *
+ * Each chunk:
+ * chunk ID (4 bytes)
+ * data size (4 bytes, little-endian, excludes the 8-byte header)
+ * data (size bytes, padded to even offset)
+ *
+ * LIST chunks carry a 4-byte list type immediately after their size,
+ * followed by sub-chunks. LIST INFO is globally standardised and
+ * appears in all RIFF-based formats.
*/
#include "platform.h"
#include "extractor.h"
@@ -36,31 +51,21 @@
/**
- * Read an uint32_t as a little-endian (least
- * significant byte first) integer from @a data.
- *
- * @param data input data
- * @return integer read
+ * Read a little-endian uint32 from @a data.
*/
static uint32_t
fread_le (const char *data)
{
- unsigned int x;
uint32_t result = 0;
- for (x = 0; x < 4; x++)
+ for (unsigned int x = 0; x < 4; x++)
result |= ((unsigned char) data[x]) << (x * 8);
return result;
}
/**
- * We implement our own rounding function, because the availability of
- * C99's round(), nearbyint(), rint(), etc. seems to be spotty, whereas
- * floor() is available in math.h on all C compilers.
- *
- * @param num value to round
- * @return rounded-to-nearest value
+ * Round @a num to the nearest integer (avoids depending on C99 round()).
*/
static double
round_double (double num)
@@ -70,21 +75,348 @@ round_double (double num)
/**
- * Pass the given UTF-8 string to the 'proc' callback using
- * the given type. Uses 'return' if 'proc' returns non-0.
- *
- * @param s 0-terminated UTF8 string value with the meta data
- * @param t libextractor type for the meta data
+ * Emit a UTF-8 string as metadata of the given type.
+ * Returns from the calling function if proc signals abort.
*/
-#define ADD(s,t) do { if (0 != ec->proc (ec->cls, "riff", t, \
- EXTRACTOR_METAFORMAT_UTF8, \
- "text/plain", s, strlen (s) \
- + 1)) return; \
+#define ADD(s, t) do { \
+ if (0 != ec->proc (ec->cls, "riff", (t), \
+ EXTRACTOR_METAFORMAT_UTF8, \
+ "text/plain", (s), strlen (s) + 1)) \
+ return; \
} while (0)
/**
- * Main entry method for the 'video/x-msvideo' extraction plugin.
+ * Maximum bytes we read from a single LIST INFO sub-chunk value.
+ */
+#define INFO_VALUE_MAX 1024
+
+/**
+ * Maximum number of chunks we scan at each nesting level to guard
+ * against malformed files.
+ */
+#define MAX_CHUNKS 512
+
+
+/**
+ * Mapping from a LIST INFO four-CC to a libextractor meta type.
+ */
+struct InfoTag
+{
+ char id[4];
+ enum EXTRACTOR_MetaType type;
+};
+
+static const struct InfoTag INFO_TAGS[] = {
+ { "INAM", EXTRACTOR_METATYPE_TITLE },
+ { "IART", EXTRACTOR_METATYPE_ARTIST },
+ { "ICOP", EXTRACTOR_METATYPE_COPYRIGHT },
+ { "ICRD", EXTRACTOR_METATYPE_CREATION_DATE },
+ { "IGNR", EXTRACTOR_METATYPE_GENRE },
+ { "IKEY", EXTRACTOR_METATYPE_KEYWORDS },
+ { "ISFT", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+ { "ICMT", EXTRACTOR_METATYPE_COMMENT },
+ { "ISRC", EXTRACTOR_METATYPE_SOURCE },
+ { "ISBJ", EXTRACTOR_METATYPE_SUBJECT },
+ { "ITRK", EXTRACTOR_METATYPE_TRACK_NUMBER },
+ { "IPRD", EXTRACTOR_METATYPE_ALBUM },
+ { "ILNG", EXTRACTOR_METATYPE_LANGUAGE },
+};
+
+
+/**
+ * Video metadata accumulated from an AVI LIST hdrl.
+ */
+struct AviState
+{
+ uint32_t us_per_frame; /* dwMicroSecPerFrame from avih */
+ uint32_t total_frames; /* dwTotalFrames from avih */
+ uint32_t width; /* dwWidth from avih */
+ uint32_t height; /* dwHeight from avih */
+ char codec[5]; /* fccHandler from first vids strh, NUL-terminated */
+ int have_avih;
+ int have_codec;
+};
+
+
+/**
+ * Seek to @a pos, then read @a want bytes into *@a data.
+ *
+ * @return bytes read, or -1 on seek failure
+ */
+static ssize_t
+seek_and_read (struct EXTRACTOR_ExtractContext *ec,
+ uint64_t pos,
+ void **data,
+ size_t want)
+{
+ if ((int64_t) pos !=
+ ec->seek (ec->cls,
+ (int64_t) pos,
+ SEEK_SET))
+ return -1;
+ return ec->read (ec->cls,
+ data,
+ want);
+}
+
+
+/**
+ * Parse sub-chunks of a LIST INFO chunk and emit all recognised tags.
+ *
+ * @param ec extraction context
+ * @param start file offset of the first sub-chunk (immediately after list type)
+ * @param end file offset one past the last byte of the enclosing LIST chunk
+ * @return 0 to continue, 1 if proc signalled abort
+ */
+static int
+parse_list_info (struct EXTRACTOR_ExtractContext *ec,
+ uint64_t start,
+ uint64_t end)
+{
+ uint64_t pos = start;
+
+ for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
+ {
+ void *data;
+ ssize_t got;
+ char id[4];
+ uint32_t csz;
+
+ got = seek_and_read (ec,
+ pos,
+ &data,
+ 8);
+ if (got < 8)
+ break;
+ memcpy (id,
+ data,
+ 4);
+ csz = fread_le ((const char *) data + 4);
+ if ( (pos + 8 + csz > end) ||
+ (pos + 8 + csz < pos) )
+ {
+ /* terminate if chunk overflows the list */
+ break;
+ }
+ if (0 == csz)
+ {
+ /* skip empty chunk */
+ pos += 8;
+ continue;
+ }
+
+ for (unsigned int i = 0;
+ i < sizeof (INFO_TAGS) / sizeof (INFO_TAGS[0]);
+ i++)
+ {
+ size_t rlen = (csz < INFO_VALUE_MAX) ? (size_t) csz : INFO_VALUE_MAX;
+ char buf[INFO_VALUE_MAX + 1];
+ size_t slen;
+
+ if (0 !=
+ memcmp (id,
+ INFO_TAGS[i].id,
+ 4))
+ continue;
+ got = seek_and_read (ec,
+ pos + 8,
+ &data,
+ rlen);
+ if (got <= 0)
+ break;
+ slen = (size_t) got;
+ memcpy (buf,
+ data,
+ slen);
+ /* strip trailing NULs and spaces that some encoders pad with */
+ while ( (slen > 0) &&
+ ( ('\0' == buf[slen - 1]) ||
+ (' ' == buf[slen - 1]) ) )
+ slen--;
+ if (0 == slen)
+ break;
+ buf[slen] = '\0';
+ if (0 != ec->proc (ec->cls,
+ "riff",
+ INFO_TAGS[i].type,
+ EXTRACTOR_METAFORMAT_UTF8,
+ "text/plain",
+ buf,
+ slen + 1))
+ return 1;
+ break;
+ }
+ pos += 8 + csz + (csz & 1);
+ }
+ return 0;
+}
+
+
+/**
+ * Parse sub-chunks of a LIST strl; extract the codec fourcc from the
+ * first video stream header found.
+ *
+ * @param ec extraction context
+ * @param start file offset of the first sub-chunk
+ * @param end file offset one past the last byte of the strl LIST
+ * @param state AVI state to update
+ */
+static void
+parse_strl (struct EXTRACTOR_ExtractContext *ec,
+ uint64_t start,
+ uint64_t end,
+ struct AviState *state)
+{
+ uint64_t pos = start;
+
+ for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
+ {
+ void *data;
+ ssize_t got;
+ char id[4];
+ uint32_t csz;
+
+ got = seek_and_read (ec,
+ pos,
+ &data,
+ 8);
+ if (got < 8)
+ break;
+ memcpy (id,
+ data,
+ 4);
+ csz = fread_le ((const char *) data + 4);
+
+ if (! state->have_codec &&
+ (0 == memcmp (id,
+ "strh",
+ 4)) &&
+ (csz >= 8) &&
+ (pos + 8 + csz <= end))
+ {
+ /* strh layout: fccType[4] fccHandler[4] ... */
+ got = seek_and_read (ec,
+ pos + 8,
+ &data,
+ 8);
+ if ( (got >= 8) &&
+ (0 == memcmp (data,
+ "vids",
+ 4)))
+ {
+ memcpy (state->codec,
+ (const char *) data + 4,
+ 4);
+ state->codec[4] = '\0';
+ state->have_codec = 1;
+ }
+ }
+
+ if (0 == csz)
+ break;
+ pos += 8 + csz + (csz & 1);
+ }
+}
+
+
+/**
+ * Parse sub-chunks of LIST hdrl in an AVI file; fills @a state with
+ * frame timing, dimensions, and the video codec.
+ *
+ * @param ec extraction context
+ * @param start file offset of the first sub-chunk (after list type "hdrl")
+ * @param end file offset one past the last byte of the hdrl LIST
+ * @param state AVI state to update
+ */
+static void
+parse_hdrl (struct EXTRACTOR_ExtractContext *ec,
+ uint64_t start,
+ uint64_t end,
+ struct AviState *state)
+{
+ uint64_t pos = start;
+
+ for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++)
+ {
+ void *data;
+ ssize_t got;
+ char id[4];
+ uint32_t csz;
+
+ got = seek_and_read (ec,
+ pos,
+ &data,
+ 8);
+ if (got < 8)
+ break;
+ memcpy (id,
+ data,
+ 4);
+ csz = fread_le ((const char *) data + 4);
+
+ if (! state->have_avih &&
+ (0 == memcmp (id, "avih", 4)) &&
+ (csz >= 40) &&
+ (pos + 8 + csz <= end))
+ {
+ /* AVIMAINHEADER layout (all DWORDs, little-endian):
+ [0] dwMicroSecPerFrame
+ [4] dwMaxBytesPerSec
+ [8] dwPaddingGranularity
+ [12] dwFlags
+ [16] dwTotalFrames
+ [20] dwInitialFrames
+ [24] dwStreams
+ [28] dwSuggestedBufferSize
+ [32] dwWidth
+ [36] dwHeight */
+ got = seek_and_read (ec,
+ pos + 8,
+ &data,
+ 40);
+ if (got >= 40)
+ {
+ const char *d = data;
+
+ state->us_per_frame = fread_le (&d[0]);
+ state->total_frames = fread_le (&d[16]);
+ state->width = fread_le (&d[32]);
+ state->height = fread_le (&d[36]);
+ state->have_avih = 1;
+ }
+ }
+ else if ((0 == memcmp (id,
+ "LIST",
+ 4)) &&
+ (csz >= 4) &&
+ (pos + 8 + csz <= end))
+ {
+ got = seek_and_read (ec,
+ pos + 8,
+ &data,
+ 4);
+ if ( (got >= 4) &&
+ (0 == memcmp (data,
+ "strl",
+ 4)))
+ parse_strl (ec,
+ pos + 12,
+ pos + 8 + csz,
+ state);
+ }
+
+ if (0 == csz)
+ break;
+ pos += 8 + csz + (csz & 1);
+ }
+}
+
+
+/**
+ * Main entry method for the RIFF extraction plugin.
+ * Handles any RIFF-based format; extracts LIST INFO tags universally
+ * and AVI video stream metadata for "AVI " files.
*
* @param ec extraction context provided to the plugin
*/
@@ -94,73 +426,169 @@ EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec);
void
EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec)
{
- ssize_t xsize;
void *data;
- char *xdata;
- uint32_t blockLen;
- unsigned int fps;
- unsigned int duration;
- uint64_t pos;
- uint32_t width;
- uint32_t height;
- char codec[5];
- char format[256];
-
- /* read header */
- if (72 > (xsize = ec->read (ec->cls, &data, 72)))
+ ssize_t got;
+ char form_type[4];
+ uint32_t riff_size;
+ uint64_t file_size;
+ uint64_t riff_end;
+ const char *mime;
+ int is_avi;
+ struct AviState avi;
+
+ /* need at least "RIFF" + size + form type */
+ got = ec->read (ec->cls,
+ &data,
+ 12);
+ if (got < 12)
return;
- xdata = data;
-
- /* check magic values */
- if ( (0 != memcmp (&xdata[0],
- "RIFF", 4)) ||
- (0 != memcmp (&xdata[8], "AVI ", 4)) ||
- (0 != memcmp (&xdata[12], "LIST", 4)) ||
- (0 != memcmp (&xdata[20], "hdrlavih", 8)) )
+ if (0 != memcmp (data,
+ "RIFF",
+ 4))
return;
- blockLen = fread_le (&xdata[28]);
+ riff_size = fread_le ((const char *) data + 4);
+ memcpy (form_type,
+ (const char *) data + 8,
+ 4);
- /* begin of AVI header at 32 */
- fps = (unsigned int) round_double ((double) 1.0e6 / fread_le (
- &xdata[32]));
- duration = (unsigned int) round_double ((double) fread_le (
- &xdata[48])
- * 1000 / fps);
- width = fread_le (&xdata[64]);
- height = fread_le (&xdata[68]);
+ file_size = ec->get_size (ec->cls);
+ /* riff_size counts bytes after the 8-byte RIFF header */
+ riff_end = (uint64_t) riff_size + 8;
+ if (riff_end > file_size)
+ riff_end = file_size;
- /* pos: begin of video stream header */
- pos = blockLen + 32;
+ /* map known form types to MIME strings */
+ if (0 == memcmp (form_type,
+ "AVI ",
+ 4))
+ mime = "video/x-msvideo";
+ else if ((0 == memcmp (form_type,
+ "ANI ",
+ 4)) ||
+ (0 == memcmp (form_type,
+ "ACON",
+ 4)))
+ mime = "application/x-navi-animation";
+ else if (0 == memcmp (form_type,
+ "RMID",
+ 4))
+ mime = "audio/midi";
+ else
+ mime = NULL; /* unknown or handled by another plugin (e.g. WAVE) */
- if (pos !=
- ec->seek (ec->cls, pos, SEEK_SET))
- return;
- if (32 > ec->read (ec->cls, &data, 32))
- return;
- xdata = data;
+ if (NULL != mime)
+ ADD (mime,
+ EXTRACTOR_METATYPE_MIMETYPE);
- /* check magic */
- if ( (0 != memcmp (xdata, "LIST", 4)) ||
- (0 != memcmp (&xdata[8], "strlstrh", 8)) ||
- (0 != memcmp (&xdata[20], "vids", 4)) )
- return;
+ is_avi = (0 == memcmp (form_type,
+ "AVI ",
+ 4));
+ memset (&avi,
+ 0,
+ sizeof (avi));
+
+ /* scan top-level chunks */
+ {
+ uint64_t pos = 12;
+
+ for (unsigned int n = 0; (pos + 8 <= riff_end) && (n < MAX_CHUNKS); n++)
+ {
+ char id[4];
+ uint32_t csz;
+
+ got = seek_and_read (ec,
+ pos,
+ &data,
+ 8);
+ if (got < 8)
+ break;
+ memcpy (id,
+ data,
+ 4);
+ csz = fread_le ((const char *) data + 4);
+
+ if (pos + 8 + (uint64_t) csz > riff_end)
+ break; /* chunk overflows the declared file size */
+
+ if ( (0 == memcmp (id,
+ "LIST",
+ 4)) &&
+ (csz >= 4) )
+ {
+ char list_type[4];
+
+ got = seek_and_read (ec,
+ pos + 8,
+ &data,
+ 4);
+ if (got >= 4)
+ {
+ memcpy (list_type,
+ data,
+ 4);
+
+ if (0 == memcmp (list_type,
+ "INFO",
+ 4))
+ {
+ if (0 != parse_list_info (ec,
+ pos + 12,
+ pos + 8 + csz))
+ return;
+ }
+ else if (is_avi &&
+ (0 == memcmp (list_type,
+ "hdrl",
+ 4)))
+ {
+ parse_hdrl (ec,
+ pos + 12,
+ pos + 8 + csz,
+ &avi);
+ }
+ }
+ }
+
+ if (0 == csz)
+ break;
+ pos += 8 + csz + (csz & 1);
+ }
+ }
+
+ /* emit AVI video metadata once we've scanned all chunks */
+ if (is_avi &&
+ avi.have_avih &&
+ avi.have_codec &&
+ (avi.us_per_frame > 0))
+ {
+ unsigned int fps =
+ (unsigned int) round_double (1.0e6 / (double) avi.us_per_frame);
+
+ if (fps > 0)
+ {
+ unsigned int duration =
+ (unsigned int) round_double ((double) avi.total_frames * 1000.0
+ / (double) fps);
+ char format[256];
- /* pos + 24: video stream header with codec */
- memcpy (codec, &xdata[24], 4);
- codec[4] = '\0';
- snprintf (format,
- sizeof (format),
- _ ("codec: %s, %u fps, %u ms"),
- codec, fps, duration);
- ADD (format, EXTRACTOR_METATYPE_FORMAT);
- snprintf (format,
- sizeof (format),
- "%ux%u",
- (unsigned int) width,
- (unsigned int) height);
- ADD (format, EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
- ADD ("video/x-msvideo", EXTRACTOR_METATYPE_MIMETYPE);
+ snprintf (format,
+ sizeof (format),
+ _ ("codec: %s, %u fps, %u ms"),
+ avi.codec,
+ fps,
+ duration);
+ ADD (format,
+ EXTRACTOR_METATYPE_FORMAT);
+ snprintf (format,
+ sizeof (format),
+ "%ux%u",
+ (unsigned int) avi.width,
+ (unsigned int) avi.height);
+ ADD (format,
+ EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
+ }
+ }
}