aboutsummaryrefslogtreecommitdiff
path: root/test/content.xml
blob: cba6ac3c29a0d71accdfba592d66cd117111250b (plain) (blame)
1
2
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE office:document-content PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "office.dtd"><office:document-content xmlns:office="http://openoffice.org/2000/office" xmlns:style="http://openoffice.org/2000/style" xmlns:text="http://openoffice.org/2000/text" xmlns:table="http://openoffice.org/2000/table" xmlns:draw="http://openoffice.org/2000/drawing" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:number="http://openoffice.org/2000/datastyle" xmlns:svg="http://www.w3.org/2000/svg" xmlns:chart="http://openoffice.org/2000/chart" xmlns:dr3d="http://openoffice.org/2000/dr3d" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns:form="http://openoffice.org/2000/form" xmlns:script="http://openoffice.org/2000/script" office:class="text" office:version="1.0"><office:script/><office:font-decls><style:font-decl style:name="Lucidasans1" fo:font-family="Lucidasans"/><style:font-decl style:name="Bitstream Vera Sans Mono" fo:font-family="&apos;Bitstream Vera Sans Mono&apos;" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Courier" fo:font-family="Courier" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Bitstream Vera Sans" fo:font-family="&apos;Bitstream Vera Sans&apos;" style:font-pitch="variable"/><style:font-decl style:name="Lucidasans" fo:font-family="Lucidasans" style:font-pitch="variable"/><style:font-decl style:name="Mincho" fo:font-family="Mincho" style:font-pitch="variable"/><style:font-decl style:name="Times New Roman" fo:font-family="&apos;Times New Roman&apos;" style:font-family-generic="roman" style:font-pitch="variable"/><style:font-decl style:name="Arial" fo:font-family="Arial" style:font-family-generic="swiss" style:font-pitch="variable"/></office:font-decls><office:automatic-styles><style:style style:name="P1" style:family="paragraph" style:parent-style-name="Standard"><style:properties fo:font-weight="bold" 
style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="P2" style:family="paragraph" style:parent-style-name="Preformatted Text"><style:properties style:font-name="Courier" fo:font-size="9pt" fo:font-weight="bold" style:font-size-asian="9pt" style:font-weight-asian="bold" style:font-size-complex="9pt" style:font-weight-complex="bold"/></style:style><style:style style:name="P3" style:family="paragraph" style:parent-style-name="Preformatted Text"><style:properties style:font-name="Courier" fo:font-size="9pt" style:font-size-asian="9pt" style:font-size-complex="9pt"/></style:style><style:style style:name="T1" style:family="text"><style:properties style:font-name="Courier" fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="T2" style:family="text"><style:properties style:font-name="Courier"/></style:style><style:style style:name="T3" style:family="text"><style:properties style:font-name="Courier" fo:font-weight="normal" style:font-weight-asian="normal" style:font-weight-complex="normal"/></style:style><style:style style:name="T4" style:family="text"><style:properties style:font-name="Courier" fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic"/></style:style><style:style style:name="T5" style:family="text"><style:properties fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="T6" style:family="text"><style:properties fo:font-style="normal" fo:font-weight="bold" style:font-style-asian="normal" style:font-weight-asian="bold" style:font-style-complex="normal" style:font-weight-complex="bold"/></style:style><style:style style:name="T7" style:family="text"><style:properties fo:font-style="italic" style:font-style-asian="italic" 
style:font-style-complex="italic"/></style:style></office:automatic-styles><office:body><text:sequence-decls><text:sequence-decl text:display-outline-level="0" text:name="Illustration"/><text:sequence-decl text:display-outline-level="0" text:name="Table"/><text:sequence-decl text:display-outline-level="0" text:name="Text"/><text:sequence-decl text:display-outline-level="0" text:name="Drawing"/></text:sequence-decls><text:p text:style-name="Heading"><text:title/>NAME</text:p><text:p text:style-name="Standard">extract - determine meta-information about a file</text:p><text:p text:style-name="Heading">SYNOPSIS</text:p><text:p text:style-name="Preformatted Text"><text:span text:style-name="T1">extract </text:span><text:span text:style-name="T2">[</text:span><text:span text:style-name="T1">-abdfhLnrsvV</text:span><text:span text:style-name="T2">] </text:span><text:span text:style-name="T3">[</text:span><text:span text:style-name="T1">-B</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">language</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-H</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">hash-algorithm</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-l</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">library</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-p </text:span><text:span text:style-name="T4">type</text:span><text:span text:style-name="T2">]<text:line-break/> <text:s text:c="3"/>[</text:span><text:span text:style-name="T1">-x</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">type</text:span><text:span text:style-name="T2">] </text:span><text:span text:style-name="T4">file </text:span><text:span text:style-name="T2">...</text:span></text:p><text:p 
text:style-name="Heading">DESCRIPTION</text:p><text:p text:style-name="Standard">This manual page documents version 0.4.0 of the <text:span text:style-name="T5">extract</text:span> command.</text:p><text:p text:style-name="Preformatted Text"/><text:p text:style-name="Standard"><text:span text:style-name="T6">extract</text:span> tests each file specified in the argument list in an attempt to infer meta-information from it. <text:s/>Each file is subjected to the meta-data extraction libraries from <text:span text:style-name="T6">libextractor</text:span>. </text:p><text:p text:style-name="Preformatted Text"/><text:p text:style-name="Standard"><text:span text:style-name="T5">libextractor</text:span> classifies meta-information (also referred to as keywords) into types. A list of all types can be obtained with the <text:span text:style-name="T5">-L</text:span> option.</text:p><text:p text:style-name="Heading">OPTIONS</text:p><text:p text:style-name="P1">-a</text:p><text:p text:style-name="First line indent">Do not remove any duplicates, even if the keywords match exactly and have the same type (i.e. because the same keyword was found by different extractor libraries).</text:p><text:p text:style-name="P1">-b</text:p><text:p text:style-name="First line indent">Display the output in BibTeX format. This implies the <text:span text:style-name="T5">-d</text:span> option.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-B</text:span> <text:span text:style-name="T7">LANG</text:span></text:p><text:p text:style-name="First line indent">Use the generic plaintext extractor for the language with the 2-letter language code <text:span text:style-name="T7">LANG</text:span>. <text:s/>Supported languages are DA (Danish), DE (German), EN (English), ES (Spanish), IT (Italian) and NO (Norwegian).</text:p><text:p text:style-name="P1">-d</text:p><text:p text:style-name="First line indent">Remove duplicates only if the types match exactly. 
By default, duplicates are removed if the types match or if one of the types is <text:span text:style-name="T7">unknown</text:span> (in this case, the duplicate of unknown type is removed).</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-f</text:span></text:p><text:p text:style-name="First line indent">Add the filename(s) (without directory) to the list of keywords.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-h</text:span></text:p><text:p text:style-name="First line indent">Print a brief summary of the options.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-H</text:span> <text:span text:style-name="T7">ALGORITHM</text:span></text:p><text:p text:style-name="First line indent">Use the <text:span text:style-name="T7">ALGORITHM</text:span> to compute a hash of each file (possible algorithms are sha1 and md5).</text:p><text:p text:style-name="P1">-L</text:p><text:p text:style-name="First line indent">Print a list of all known keyword types.</text:p><text:p text:style-name="P1">-n</text:p><text:p text:style-name="First line indent">Do not use the default set of extractors (typically all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime-types); use only the extractors specified with the <text:span text:style-name="T5">-l</text:span> option.</text:p><text:p text:style-name="P1">-r</text:p><text:p text:style-name="First line indent">Remove all duplicates disregarding differences in the keyword type.</text:p><text:p text:style-name="P1">-s</text:p><text:p text:style-name="First line indent">Split keywords at delimiters (space, comma, colon, etc.) and list the split keywords as being of <text:s/><text:span text:style-name="T7">unknown</text:span> type. This can also be done by loading the split-library. Using this option guarantees that the splitting is performed after all other libraries have been run. 
It is always performed before duplicate elimination.</text:p><text:p text:style-name="P1">-v</text:p><text:p text:style-name="First line indent">Print the version number and exit.</text:p><text:p text:style-name="P1">-V</text:p><text:p text:style-name="First line indent">Be verbose.</text:p><text:p text:style-name="P1">-B</text:p><text:p text:style-name="First line indent">Run the printable extractor (costly, generic extractor for binaries).</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-l</text:span> <text:span text:style-name="T7">libraries</text:span></text:p><text:p text:style-name="First line indent">Use the specified <text:span text:style-name="T7">libraries</text:span> to extract keywords. The general format of libraries is <text:span text:style-name="T7">[[-]LIBRARYNAME[:[-]LIBRARYNAME]*]</text:span> where <text:span text:style-name="T7">LIBRARYNAME</text:span> is a libextractor-compatible library and typically of the form <text:span text:style-name="T7">libextractor_jpeg.so</text:span>. The minus before the library name indicates that this library should be run after all the libraries that were specified so far. If the minus is missing, the library is run before all previously specified libraries. </text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-p</text:span> <text:span text:style-name="T7">type</text:span></text:p><text:p text:style-name="First line indent">Print only the keywords matching the specified <text:span text:style-name="T7">type</text:span>. By default, all keywords that are found and not removed as duplicates are printed.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-x</text:span> <text:span text:style-name="T7">type</text:span></text:p><text:p text:style-name="First line indent">Exclude keywords of the specified <text:span text:style-name="T7">type</text:span> from the output. 
By default, all keywords that are found and not removed as duplicates are printed.</text:p><text:p text:style-name="Heading">SEE ALSO</text:p><text:p text:style-name="Standard">libextractor (3) - description of the libextractor library</text:p><text:p text:style-name="Heading">EXAMPLES</text:p><text:p text:style-name="P2">$ extract test/test.jpg</text:p><text:p text:style-name="P3">comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1</text:p><text:p text:style-name="P3">mimetype - image/jpeg</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -Vf -x comment test/test.jpg</text:p><text:p text:style-name="P3">Keywords for file test/test.jpg:</text:p><text:p text:style-name="P3">mimetype - image/jpeg</text:p><text:p text:style-name="P3">filename - test.jpg</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -p comment test/test.jpg</text:p><text:p text:style-name="P3">comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -nV -l libextractor_png.so -p comment test/test.jpg test/test.png</text:p><text:p text:style-name="P3">Keywords for file test/test.jpg:</text:p><text:p text:style-name="P3">Keywords for file test/test.png:</text:p><text:p text:style-name="P3">comment - Testing keyword extraction</text:p><text:p text:style-name="Heading">LEGAL NOTICE</text:p><text:p text:style-name="Standard">libextractor and the extract tool are released under the GPL.</text:p><text:p text:style-name="Heading">BUGS</text:p><text:p text:style-name="Standard">A couple of file-formats (on the order of 10^3) are not recognized...</text:p><text:p text:style-name="Heading">AUTHORS</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">extract</text:span> was originally written by Christian Grothoff &lt;christian@grothoff.org&gt; and Vidyut Samanta &lt;vids@cs.ucla.edu&gt;. 
Use &lt;<text:a xlink:type="simple" xlink:href="mailto:libextractor@cs.purdue.edu">libextractor@cs.purdue.edu</text:a>&gt; to contact the current maintainer(s).</text:p><text:p text:style-name="Heading">AVAILABILITY</text:p><text:p text:style-name="Standard">You can obtain the original author&apos;s latest version from <text:a xlink:type="simple" xlink:href="http://ovmj.org/libextractor/">http://ovmj.org/libextractor/</text:a>.</text:p></office:body></office:document-content>