riff_extractor.c (15405B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2004, 2009, 2012, 2025 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 20 This code was based on AVInfo 1.0 alpha 11 21 (c) George Shuklin, gs]AT[shounen.ru, 2002-2004 22 http://shounen.ru/soft/avinfo/ 23 24 and bitcollider 0.6.0 25 (PD) 2004 The Bitzi Corporation 26 http://bitzi.com/ 27 */ 28 /** 29 * @file plugins/riff_extractor.c 30 * @brief plugin to support RIFF files (AVI, ANI, and others) 31 * @author Christian Grothoff 32 * 33 * RIFF structure: 34 * "RIFF" (4 bytes) 35 * file-size - 8 (4 bytes, little-endian) 36 * form type (4 bytes): "AVI ", "WAVE", "ANI ", etc. 37 * chunks... 38 * 39 * Each chunk: 40 * chunk ID (4 bytes) 41 * data size (4 bytes, little-endian, excludes the 8-byte header) 42 * data (size bytes, padded to even offset) 43 * 44 * LIST chunks carry a 4-byte list type immediately after their size, 45 * followed by sub-chunks. LIST INFO is globally standardised and 46 * appears in all RIFF-based formats. 47 */ 48 #include "platform.h" 49 #include "extractor.h" 50 #include <math.h> 51 52 53 /** 54 * Read a little-endian uint32 from @a data. 55 */ 56 static uint32_t 57 fread_le (const char *data) 58 { 59 uint32_t result = 0; 60 61 for (unsigned int x = 0; x < 4; x++) 62 result |= ((unsigned char) data[x]) << (x * 8); 63 return result; 64 } 65 66 67 /** 68 * Round @a num to the nearest integer (avoids depending on C99 round()). 69 */ 70 static double 71 round_double (double num) 72 { 73 return floor (num + 0.5); 74 } 75 76 77 /** 78 * Emit a UTF-8 string as metadata of the given type. 79 * Returns from the calling function if proc signals abort. 80 */ 81 #define ADD(s, t) do { \ 82 if (0 != ec->proc (ec->cls, "riff", (t), \ 83 EXTRACTOR_METAFORMAT_UTF8, \ 84 "text/plain", (s), strlen (s) + 1)) \ 85 return; \ 86 } while (0) 87 88 89 /** 90 * Maximum bytes we read from a single LIST INFO sub-chunk value. 91 */ 92 #define INFO_VALUE_MAX 1024 93 94 /** 95 * Maximum number of chunks we scan at each nesting level to guard 96 * against malformed files. 97 */ 98 #define MAX_CHUNKS 512 99 100 101 /** 102 * Mapping from a LIST INFO four-CC to a libextractor meta type. 103 */ 104 struct InfoTag 105 { 106 char id[4]; 107 enum EXTRACTOR_MetaType type; 108 }; 109 110 static const struct InfoTag INFO_TAGS[] = { 111 { "INAM", EXTRACTOR_METATYPE_TITLE }, 112 { "IART", EXTRACTOR_METATYPE_ARTIST }, 113 { "ICOP", EXTRACTOR_METATYPE_COPYRIGHT }, 114 { "ICRD", EXTRACTOR_METATYPE_CREATION_DATE }, 115 { "IGNR", EXTRACTOR_METATYPE_GENRE }, 116 { "IKEY", EXTRACTOR_METATYPE_KEYWORDS }, 117 { "ISFT", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 118 { "ICMT", EXTRACTOR_METATYPE_COMMENT }, 119 { "ISRC", EXTRACTOR_METATYPE_SOURCE }, 120 { "ISBJ", EXTRACTOR_METATYPE_SUBJECT }, 121 { "ITRK", EXTRACTOR_METATYPE_TRACK_NUMBER }, 122 { "IPRD", EXTRACTOR_METATYPE_ALBUM }, 123 { "ILNG", EXTRACTOR_METATYPE_LANGUAGE }, 124 }; 125 126 127 /** 128 * Video metadata accumulated from an AVI LIST hdrl. 129 */ 130 struct AviState 131 { 132 uint32_t us_per_frame; /* dwMicroSecPerFrame from avih */ 133 uint32_t total_frames; /* dwTotalFrames from avih */ 134 uint32_t width; /* dwWidth from avih */ 135 uint32_t height; /* dwHeight from avih */ 136 char codec[5]; /* fccHandler from first vids strh, NUL-terminated */ 137 int have_avih; 138 int have_codec; 139 }; 140 141 142 /** 143 * Seek to @a pos, then read @a want bytes into *@a data. 144 * 145 * @return bytes read, or -1 on seek failure 146 */ 147 static ssize_t 148 seek_and_read (struct EXTRACTOR_ExtractContext *ec, 149 uint64_t pos, 150 void **data, 151 size_t want) 152 { 153 if ((int64_t) pos != 154 ec->seek (ec->cls, 155 (int64_t) pos, 156 SEEK_SET)) 157 return -1; 158 return ec->read (ec->cls, 159 data, 160 want); 161 } 162 163 164 /** 165 * Parse sub-chunks of a LIST INFO chunk and emit all recognised tags. 166 * 167 * @param ec extraction context 168 * @param start file offset of the first sub-chunk (immediately after list type) 169 * @param end file offset one past the last byte of the enclosing LIST chunk 170 * @return 0 to continue, 1 if proc signalled abort 171 */ 172 static int 173 parse_list_info (struct EXTRACTOR_ExtractContext *ec, 174 uint64_t start, 175 uint64_t end) 176 { 177 uint64_t pos = start; 178 179 for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++) 180 { 181 void *data; 182 ssize_t got; 183 char id[4]; 184 uint32_t csz; 185 186 got = seek_and_read (ec, 187 pos, 188 &data, 189 8); 190 if (got < 8) 191 break; 192 memcpy (id, 193 data, 194 4); 195 csz = fread_le ((const char *) data + 4); 196 if ( (pos + 8 + csz > end) || 197 (pos + 8 + csz < pos) ) 198 { 199 /* terminate if chunk overflows the list */ 200 break; 201 } 202 if (0 == csz) 203 { 204 /* skip empty chunk */ 205 pos += 8; 206 continue; 207 } 208 209 for (unsigned int i = 0; 210 i < sizeof (INFO_TAGS) / sizeof (INFO_TAGS[0]); 211 i++) 212 { 213 size_t rlen = (csz < INFO_VALUE_MAX) ? (size_t) csz : INFO_VALUE_MAX; 214 char buf[INFO_VALUE_MAX + 1]; 215 size_t slen; 216 217 if (0 != 218 memcmp (id, 219 INFO_TAGS[i].id, 220 4)) 221 continue; 222 got = seek_and_read (ec, 223 pos + 8, 224 &data, 225 rlen); 226 if (got <= 0) 227 break; 228 slen = (size_t) got; 229 memcpy (buf, 230 data, 231 slen); 232 /* strip trailing NULs and spaces that some encoders pad with */ 233 while ( (slen > 0) && 234 ( ('\0' == buf[slen - 1]) || 235 (' ' == buf[slen - 1]) ) ) 236 slen--; 237 if (0 == slen) 238 break; 239 buf[slen] = '\0'; 240 if (0 != ec->proc (ec->cls, 241 "riff", 242 INFO_TAGS[i].type, 243 EXTRACTOR_METAFORMAT_UTF8, 244 "text/plain", 245 buf, 246 slen + 1)) 247 return 1; 248 break; 249 } 250 pos += 8 + csz + (csz & 1); 251 } 252 return 0; 253 } 254 255 256 /** 257 * Parse sub-chunks of a LIST strl; extract the codec fourcc from the 258 * first video stream header found. 259 * 260 * @param ec extraction context 261 * @param start file offset of the first sub-chunk 262 * @param end file offset one past the last byte of the strl LIST 263 * @param state AVI state to update 264 */ 265 static void 266 parse_strl (struct EXTRACTOR_ExtractContext *ec, 267 uint64_t start, 268 uint64_t end, 269 struct AviState *state) 270 { 271 uint64_t pos = start; 272 273 for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++) 274 { 275 void *data; 276 ssize_t got; 277 char id[4]; 278 uint32_t csz; 279 280 got = seek_and_read (ec, 281 pos, 282 &data, 283 8); 284 if (got < 8) 285 break; 286 memcpy (id, 287 data, 288 4); 289 csz = fread_le ((const char *) data + 4); 290 291 if (! state->have_codec && 292 (0 == memcmp (id, 293 "strh", 294 4)) && 295 (csz >= 8) && 296 (pos + 8 + csz <= end)) 297 { 298 /* strh layout: fccType[4] fccHandler[4] ... */ 299 got = seek_and_read (ec, 300 pos + 8, 301 &data, 302 8); 303 if ( (got >= 8) && 304 (0 == memcmp (data, 305 "vids", 306 4))) 307 { 308 memcpy (state->codec, 309 (const char *) data + 4, 310 4); 311 state->codec[4] = '\0'; 312 state->have_codec = 1; 313 } 314 } 315 316 if (0 == csz) 317 break; 318 pos += 8 + csz + (csz & 1); 319 } 320 } 321 322 323 /** 324 * Parse sub-chunks of LIST hdrl in an AVI file; fills @a state with 325 * frame timing, dimensions, and the video codec. 326 * 327 * @param ec extraction context 328 * @param start file offset of the first sub-chunk (after list type "hdrl") 329 * @param end file offset one past the last byte of the hdrl LIST 330 * @param state AVI state to update 331 */ 332 static void 333 parse_hdrl (struct EXTRACTOR_ExtractContext *ec, 334 uint64_t start, 335 uint64_t end, 336 struct AviState *state) 337 { 338 uint64_t pos = start; 339 340 for (unsigned int n = 0; (pos + 8 <= end) && (n < MAX_CHUNKS); n++) 341 { 342 void *data; 343 ssize_t got; 344 char id[4]; 345 uint32_t csz; 346 347 got = seek_and_read (ec, 348 pos, 349 &data, 350 8); 351 if (got < 8) 352 break; 353 memcpy (id, 354 data, 355 4); 356 csz = fread_le ((const char *) data + 4); 357 358 if (! state->have_avih && 359 (0 == memcmp (id, "avih", 4)) && 360 (csz >= 40) && 361 (pos + 8 + csz <= end)) 362 { 363 /* AVIMAINHEADER layout (all DWORDs, little-endian): 364 [0] dwMicroSecPerFrame 365 [4] dwMaxBytesPerSec 366 [8] dwPaddingGranularity 367 [12] dwFlags 368 [16] dwTotalFrames 369 [20] dwInitialFrames 370 [24] dwStreams 371 [28] dwSuggestedBufferSize 372 [32] dwWidth 373 [36] dwHeight */ 374 got = seek_and_read (ec, 375 pos + 8, 376 &data, 377 40); 378 if (got >= 40) 379 { 380 const char *d = data; 381 382 state->us_per_frame = fread_le (&d[0]); 383 state->total_frames = fread_le (&d[16]); 384 state->width = fread_le (&d[32]); 385 state->height = fread_le (&d[36]); 386 state->have_avih = 1; 387 } 388 } 389 else if ((0 == memcmp (id, 390 "LIST", 391 4)) && 392 (csz >= 4) && 393 (pos + 8 + csz <= end)) 394 { 395 got = seek_and_read (ec, 396 pos + 8, 397 &data, 398 4); 399 if ( (got >= 4) && 400 (0 == memcmp (data, 401 "strl", 402 4))) 403 parse_strl (ec, 404 pos + 12, 405 pos + 8 + csz, 406 state); 407 } 408 409 if (0 == csz) 410 break; 411 pos += 8 + csz + (csz & 1); 412 } 413 } 414 415 416 /** 417 * Main entry method for the RIFF extraction plugin. 418 * Handles any RIFF-based format; extracts LIST INFO tags universally 419 * and AVI video stream metadata for "AVI " files. 420 * 421 * @param ec extraction context provided to the plugin 422 */ 423 void 424 EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec); 425 426 void 427 EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec) 428 { 429 void *data; 430 ssize_t got; 431 char form_type[4]; 432 uint32_t riff_size; 433 uint64_t file_size; 434 uint64_t riff_end; 435 const char *mime; 436 int is_avi; 437 struct AviState avi; 438 439 /* need at least "RIFF" + size + form type */ 440 got = ec->read (ec->cls, 441 &data, 442 12); 443 if (got < 12) 444 return; 445 if (0 != memcmp (data, 446 "RIFF", 447 4)) 448 return; 449 450 riff_size = fread_le ((const char *) data + 4); 451 memcpy (form_type, 452 (const char *) data + 8, 453 4); 454 455 file_size = ec->get_size (ec->cls); 456 /* riff_size counts bytes after the 8-byte RIFF header */ 457 riff_end = (uint64_t) riff_size + 8; 458 if (riff_end > file_size) 459 riff_end = file_size; 460 461 /* map known form types to MIME strings */ 462 if (0 == memcmp (form_type, 463 "AVI ", 464 4)) 465 mime = "video/x-msvideo"; 466 else if ((0 == memcmp (form_type, 467 "ANI ", 468 4)) || 469 (0 == memcmp (form_type, 470 "ACON", 471 4))) 472 mime = "application/x-navi-animation"; 473 else if (0 == memcmp (form_type, 474 "RMID", 475 4)) 476 mime = "audio/midi"; 477 else 478 mime = NULL; /* unknown or handled by another plugin (e.g. WAVE) */ 479 480 if (NULL != mime) 481 ADD (mime, 482 EXTRACTOR_METATYPE_MIMETYPE); 483 484 is_avi = (0 == memcmp (form_type, 485 "AVI ", 486 4)); 487 memset (&avi, 488 0, 489 sizeof (avi)); 490 491 /* scan top-level chunks */ 492 { 493 uint64_t pos = 12; 494 495 for (unsigned int n = 0; (pos + 8 <= riff_end) && (n < MAX_CHUNKS); n++) 496 { 497 char id[4]; 498 uint32_t csz; 499 500 got = seek_and_read (ec, 501 pos, 502 &data, 503 8); 504 if (got < 8) 505 break; 506 memcpy (id, 507 data, 508 4); 509 csz = fread_le ((const char *) data + 4); 510 511 if (pos + 8 + (uint64_t) csz > riff_end) 512 break; /* chunk overflows the declared file size */ 513 514 if ( (0 == memcmp (id, 515 "LIST", 516 4)) && 517 (csz >= 4) ) 518 { 519 char list_type[4]; 520 521 got = seek_and_read (ec, 522 pos + 8, 523 &data, 524 4); 525 if (got >= 4) 526 { 527 memcpy (list_type, 528 data, 529 4); 530 531 if (0 == memcmp (list_type, 532 "INFO", 533 4)) 534 { 535 if (0 != parse_list_info (ec, 536 pos + 12, 537 pos + 8 + csz)) 538 return; 539 } 540 else if (is_avi && 541 (0 == memcmp (list_type, 542 "hdrl", 543 4))) 544 { 545 parse_hdrl (ec, 546 pos + 12, 547 pos + 8 + csz, 548 &avi); 549 } 550 } 551 } 552 553 if (0 == csz) 554 break; 555 pos += 8 + csz + (csz & 1); 556 } 557 } 558 559 /* emit AVI video metadata once we've scanned all chunks */ 560 if (is_avi && 561 avi.have_avih && 562 avi.have_codec && 563 (avi.us_per_frame > 0)) 564 { 565 unsigned int fps = 566 (unsigned int) round_double (1.0e6 / (double) avi.us_per_frame); 567 568 if (fps > 0) 569 { 570 unsigned int duration = 571 (unsigned int) round_double ((double) avi.total_frames * 1000.0 572 / (double) fps); 573 char format[256]; 574 575 snprintf (format, 576 sizeof (format), 577 _ ("codec: %s, %u fps, %u ms"), 578 avi.codec, 579 fps, 580 duration); 581 ADD (format, 582 EXTRACTOR_METATYPE_FORMAT); 583 snprintf (format, 584 sizeof (format), 585 "%ux%u", 586 (unsigned int) avi.width, 587 (unsigned int) avi.height); 588 ADD (format, 589 EXTRACTOR_METATYPE_IMAGE_DIMENSIONS); 590 } 591 } 592 } 593 594 595 /* end of riff_extractor.c */