aboutsummaryrefslogtreecommitdiff
path: root/test
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2005-08-14 01:17:17 +0000
committerChristian Grothoff <christian@grothoff.org>2005-08-14 01:17:17 +0000
commitc244d85975791e3f23243a9ef4bafbc2029c49e7 (patch)
treeedb6f94ce0b766c4ab0c2a499ac544ef115926b5 /test
parentaa0b93c1cc173b5b621623af1797a0e9b85c15b6 (diff)
downloadlibextractor-c244d85975791e3f23243a9ef4bafbc2029c49e7.tar.gz
libextractor-c244d85975791e3f23243a9ef4bafbc2029c49e7.zip
more tests
Diffstat (limited to 'test')
-rw-r--r--test/content.xml2
-rw-r--r--test/diacritics-msword10-winnt.docbin0 -> 24064 bytes
-rw-r--r--test/extract-msword10-winnt.docbin0 -> 32768 bytes
-rw-r--r--test/extract-msword6-msdos.docbin0 -> 8320 bytes
-rw-r--r--test/extract-msword6-win95.docbin0 -> 15360 bytes
-rw-r--r--test/extract-msword8-winnt.docbin0 -> 28160 bytes
-rw-r--r--test/extract-msword97-winnt.rtf116
-rw-r--r--test/extract.pdfbin0 -> 99512 bytes
-rw-r--r--test/extract.sxwbin0 -> 8270 bytes
-rw-r--r--test/meta.xml2
-rw-r--r--test/settings.xml2
-rw-r--r--test/styles.xml2
12 files changed, 124 insertions, 0 deletions
diff --git a/test/content.xml b/test/content.xml
new file mode 100644
index 0000000..cba6ac3
--- /dev/null
+++ b/test/content.xml
@@ -0,0 +1,2 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE office:document-content PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "office.dtd"><office:document-content xmlns:office="http://openoffice.org/2000/office" xmlns:style="http://openoffice.org/2000/style" xmlns:text="http://openoffice.org/2000/text" xmlns:table="http://openoffice.org/2000/table" xmlns:draw="http://openoffice.org/2000/drawing" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:number="http://openoffice.org/2000/datastyle" xmlns:svg="http://www.w3.org/2000/svg" xmlns:chart="http://openoffice.org/2000/chart" xmlns:dr3d="http://openoffice.org/2000/dr3d" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns:form="http://openoffice.org/2000/form" xmlns:script="http://openoffice.org/2000/script" office:class="text" office:version="1.0"><office:script/><office:font-decls><style:font-decl style:name="Lucidasans1" fo:font-family="Lucidasans"/><style:font-decl style:name="Bitstream Vera Sans Mono" fo:font-family="&apos;Bitstream Vera Sans Mono&apos;" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Courier" fo:font-family="Courier" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Bitstream Vera Sans" fo:font-family="&apos;Bitstream Vera Sans&apos;" style:font-pitch="variable"/><style:font-decl style:name="Lucidasans" fo:font-family="Lucidasans" style:font-pitch="variable"/><style:font-decl style:name="Mincho" fo:font-family="Mincho" style:font-pitch="variable"/><style:font-decl style:name="Times New Roman" fo:font-family="&apos;Times New Roman&apos;" style:font-family-generic="roman" style:font-pitch="variable"/><style:font-decl style:name="Arial" fo:font-family="Arial" style:font-family-generic="swiss" style:font-pitch="variable"/></office:font-decls><office:automatic-styles><style:style style:name="P1" style:family="paragraph" style:parent-style-name="Standard"><style:properties fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="P2" style:family="paragraph" style:parent-style-name="Preformatted Text"><style:properties style:font-name="Courier" fo:font-size="9pt" fo:font-weight="bold" style:font-size-asian="9pt" style:font-weight-asian="bold" style:font-size-complex="9pt" style:font-weight-complex="bold"/></style:style><style:style style:name="P3" style:family="paragraph" style:parent-style-name="Preformatted Text"><style:properties style:font-name="Courier" fo:font-size="9pt" style:font-size-asian="9pt" style:font-size-complex="9pt"/></style:style><style:style style:name="T1" style:family="text"><style:properties style:font-name="Courier" fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="T2" style:family="text"><style:properties style:font-name="Courier"/></style:style><style:style style:name="T3" style:family="text"><style:properties style:font-name="Courier" fo:font-weight="normal" style:font-weight-asian="normal" style:font-weight-complex="normal"/></style:style><style:style style:name="T4" style:family="text"><style:properties style:font-name="Courier" fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic"/></style:style><style:style style:name="T5" style:family="text"><style:properties fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"/></style:style><style:style style:name="T6" style:family="text"><style:properties fo:font-style="normal" fo:font-weight="bold" style:font-style-asian="normal" style:font-weight-asian="bold" style:font-style-complex="normal" style:font-weight-complex="bold"/></style:style><style:style style:name="T7" style:family="text"><style:properties fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic"/></style:style></office:automatic-styles><office:body><text:sequence-decls><text:sequence-decl text:display-outline-level="0" text:name="Illustration"/><text:sequence-decl text:display-outline-level="0" text:name="Table"/><text:sequence-decl text:display-outline-level="0" text:name="Text"/><text:sequence-decl text:display-outline-level="0" text:name="Drawing"/></text:sequence-decls><text:p text:style-name="Heading"><text:title/>NAME</text:p><text:p text:style-name="Standard">extract - determine meta-information about a file</text:p><text:p text:style-name="Heading">SYNOPSIS</text:p><text:p text:style-name="Preformatted Text"><text:span text:style-name="T1">extract </text:span><text:span text:style-name="T2">[</text:span><text:span text:style-name="T1">-abdfhLnrsvV</text:span><text:span text:style-name="T2">] </text:span><text:span text:style-name="T3">[</text:span><text:span text:style-name="T1">-B</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">language</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-H</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">hash-algorithm</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-l</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">library</text:span><text:span text:style-name="T2">][</text:span><text:span text:style-name="T1">-p </text:span><text:span text:style-name="T4">type</text:span><text:span text:style-name="T2">]<text:line-break/> <text:s text:c="3"/>[</text:span><text:span text:style-name="T1">-x</text:span><text:span text:style-name="T2"> </text:span><text:span text:style-name="T4">type</text:span><text:span text:style-name="T2">] </text:span><text:span text:style-name="T4">file </text:span><text:span text:style-name="T2">...</text:span></text:p><text:p text:style-name="Heading">DESCRIPTION</text:p><text:p text:style-name="Standard">This manual page documents version 0.4.0 of the <text:span text:style-name="T5">extract</text:span> command.</text:p><text:p text:style-name="Preformatted Text"/><text:p text:style-name="Standard"><text:span text:style-name="T6">extract</text:span> tests each file specified in the argument list in an attempt to infer meta-information from it. <text:s/>Each file is subjected to the meta-data extraction libraries from <text:span text:style-name="T6">libextractor</text:span>. </text:p><text:p text:style-name="Preformatted Text"/><text:p text:style-name="Standard"><text:span text:style-name="T5">libextractor</text:span> classifies meta-information (also referred to as keywords) into types. A list of all types can be obtained with the <text:span text:style-name="T5">-L</text:span> option.</text:p><text:p text:style-name="Heading">OPTIONS</text:p><text:p text:style-name="P1">-a</text:p><text:p text:style-name="First line indent">Do not remove any duplicates, even if the keywords match exactly and have the same type (i.e. because the same keyword was found by different extractor libraries).</text:p><text:p text:style-name="P1">-b</text:p><text:p text:style-name="First line indent">Display the output in BiBTeX format. This implies the <text:span text:style-name="T5">-d</text:span> option.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-B</text:span> <text:span text:style-name="T7">LANG</text:span></text:p><text:p text:style-name="First line indent">Use the generic plaintext extractor for the language with the 2-letter language code <text:span text:style-name="T7">LANG</text:span>. <text:s/>Supported languages are DA (Danish), DE (German), EN (English), ES (Spanish), IT (Italian) and NO (Norwegian).</text:p><text:p text:style-name="P1">-d</text:p><text:p text:style-name="First line indent">Remove duplicates only if the types match exactly. By default, duplicates are removed if the types match or if one of the types is <text:span text:style-name="T7">unknown</text:span> (in this case, the duplicate of unknown type is removed).</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-f</text:span></text:p><text:p text:style-name="First line indent">add the filename(s) (without directory) to the list of keywords.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-h</text:span></text:p><text:p text:style-name="First line indent">Print a brief summary of the options.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-H</text:span> <text:span text:style-name="T7">ALGORITHM</text:span></text:p><text:p text:style-name="First line indent">Use the <text:span text:style-name="T7">ALGORITHM</text:span> to compute a hash of each file (possible algorithms are sha1 and md5).</text:p><text:p text:style-name="P1">-L</text:p><text:p text:style-name="First line indent">Print a list of all known keyword types.</text:p><text:p text:style-name="P1">-n</text:p><text:p text:style-name="First line indent">Do not use the default set of extractors (typically all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime-types), use only the extractors specified with the <text:span text:style-name="T5">-l</text:span> option.</text:p><text:p text:style-name="P1">-r</text:p><text:p text:style-name="First line indent">Remove all duplicates disregarding differences in the keyword type.</text:p><text:p text:style-name="P1">-s</text:p><text:p text:style-name="First line indent">Split keywords at delimiters (space, comma, colon, etc.) and list split keywords to be of <text:s/><text:span text:style-name="T7">unknown</text:span> type. This can also be done by loading the split-library. Using this option guarantees that the splitting is performed after all other libraries have been run. It is always performed before duplicate elimination.</text:p><text:p text:style-name="P1">-v</text:p><text:p text:style-name="First line indent">Print the version number and exit.</text:p><text:p text:style-name="P1">-V</text:p><text:p text:style-name="First line indent">Be verbose.</text:p><text:p text:style-name="P1">-B</text:p><text:p text:style-name="First line indent">Run the printable extractor (costly, generic extractor for binaries)</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-l</text:span> <text:span text:style-name="T7">libraries</text:span></text:p><text:p text:style-name="First line indent">Use the specified <text:span text:style-name="T7">libraries</text:span> to extract keywords. The general format of libraries is <text:span text:style-name="T7">[[-LIBRARYNAME[:[-]LIBRARYNAME]*]</text:span> where <text:span text:style-name="T7">LIBRARYNAME</text:span> is a libextractor compatible library and typically of the form <text:span text:style-name="T7">libextractor_jpeg.so</text:span>. The minus before the libraryname indicates that this library should be run after all the libraries that were specified so far. If the minus is missing, the library is run before all previously specified libraries. </text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-p</text:span> <text:span text:style-name="T7">type</text:span></text:p><text:p text:style-name="First line indent">Print only the keywords matching the specified <text:span text:style-name="T7">type</text:span>. By default, all keywords that are found and not removed as duplicates are printed.</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">-x</text:span> <text:span text:style-name="T7">type</text:span></text:p><text:p text:style-name="First line indent">Exclude keywords of the specified <text:span text:style-name="T7">type</text:span> from the output. By default, all keywords that are found and not removed as duplicates are printed.</text:p><text:p text:style-name="Heading">SEE ALSO</text:p><text:p text:style-name="Standard">libextractor (3) - description of the libextractor library</text:p><text:p text:style-name="Heading">EXAMPLES</text:p><text:p text:style-name="P2">$ extract test/test.jpg</text:p><text:p text:style-name="P3">comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1</text:p><text:p text:style-name="P3">mimetype - image/jpeg</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -Vf -x comment test/test.jpg</text:p><text:p text:style-name="P3">Keywords for file test/test.jpg:</text:p><text:p text:style-name="P3">mimetype - image/jpeg</text:p><text:p text:style-name="P3">filename - test.jpg</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -p comment test/test.jpg</text:p><text:p text:style-name="P3">comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1</text:p><text:p text:style-name="P3"/><text:p text:style-name="P2">$ extract -nV -l libextractor_png.so -p comment test/test.jpg test/test.png</text:p><text:p text:style-name="P3">Keywords for file test/test.jpg:</text:p><text:p text:style-name="P3">Keywords for file test/test.png:</text:p><text:p text:style-name="P3">comment - Testing keyword extraction</text:p><text:p text:style-name="Heading">LEGAL NOTICE</text:p><text:p text:style-name="Standard">libextractor and the extract tool are released under the GPL.</text:p><text:p text:style-name="Heading">BUGS</text:p><text:p text:style-name="Standard">A couple of file-formats (on the order of 10^3) are not recognized...</text:p><text:p text:style-name="Heading">AUTHORS</text:p><text:p text:style-name="Standard"><text:span text:style-name="T5">extract</text:span> was originally written by Christian Grothoff &lt;christian@grothoff.org&gt; and Vidyut Samanta &lt;vids@cs.ucla.edu&gt;. Use &lt;<text:a xlink:type="simple" xlink:href="mailto:libextractor@cs.purdue.edu">libextractor@cs.purdue.edu</text:a>&gt; to contact the current maintainer(s).</text:p><text:p text:style-name="Heading">AVAILABILITY</text:p><text:p text:style-name="Standard">You can obtain the original author&apos;s latest version from <text:a xlink:type="simple" xlink:href="http://ovmj.org/libextractor/">http://ovmj.org/libextractor/</text:a>.</text:p></office:body></office:document-content> \ No newline at end of file
diff --git a/test/diacritics-msword10-winnt.doc b/test/diacritics-msword10-winnt.doc
new file mode 100644
index 0000000..e0c386c
--- /dev/null
+++ b/test/diacritics-msword10-winnt.doc
Binary files differ
diff --git a/test/extract-msword10-winnt.doc b/test/extract-msword10-winnt.doc
new file mode 100644
index 0000000..3b51a76
--- /dev/null
+++ b/test/extract-msword10-winnt.doc
Binary files differ
diff --git a/test/extract-msword6-msdos.doc b/test/extract-msword6-msdos.doc
new file mode 100644
index 0000000..60375bc
--- /dev/null
+++ b/test/extract-msword6-msdos.doc
Binary files differ
diff --git a/test/extract-msword6-win95.doc b/test/extract-msword6-win95.doc
new file mode 100644
index 0000000..e4839c5
--- /dev/null
+++ b/test/extract-msword6-win95.doc
Binary files differ
diff --git a/test/extract-msword8-winnt.doc b/test/extract-msword8-winnt.doc
new file mode 100644
index 0000000..a9bd191
--- /dev/null
+++ b/test/extract-msword8-winnt.doc
Binary files differ
diff --git a/test/extract-msword97-winnt.rtf b/test/extract-msword97-winnt.rtf
new file mode 100644
index 0000000..790435b
--- /dev/null
+++ b/test/extract-msword97-winnt.rtf
@@ -0,0 +1,116 @@
1{\rtf1\ansi\ansicpg1252\uc1\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang1036\deflangfe1036{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
2{\f5\fmodern\fcharset0\fprq1{\*\panose 02070409020205020404}Courier;}{\f36\fmodern\fcharset0\fprq1{\*\panose 020b0609030804020204}Bitstream Vera Sans Mono;}{\f37\froman\fcharset238\fprq2 Times New Roman CE;}
3{\f38\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f40\froman\fcharset161\fprq2 Times New Roman Greek;}{\f41\froman\fcharset162\fprq2 Times New Roman Tur;}{\f42\froman\fcharset177\fprq2 Times New Roman (Hebrew);}
4{\f43\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f44\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f45\froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f47\fswiss\fcharset238\fprq2 Arial CE;}
5{\f48\fswiss\fcharset204\fprq2 Arial Cyr;}{\f50\fswiss\fcharset161\fprq2 Arial Greek;}{\f51\fswiss\fcharset162\fprq2 Arial Tur;}{\f52\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f53\fswiss\fcharset178\fprq2 Arial (Arabic);}
6{\f54\fswiss\fcharset186\fprq2 Arial Baltic;}{\f55\fswiss\fcharset163\fprq2 Arial (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;
7\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{
8\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \snext0 Normal;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
9\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv
10\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs20\lang1024\langfe1024\cgrid\langnp1024\langfenp1024 \snext11 \ssemihidden Normal Table;}{\*\cs15 \additive \ul\cf9 Hyperlink;}{
11\s16\ql \li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon0 \snext16 Body Text;}{\s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0
12\fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon16 \snext17 Body Text First Indent;}{\s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036
13\sbasedon0 \snext16 Title;}{\s19\ql \li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon16 \snext19 List;}{
14\s20\ql \li0\ri0\sb120\sa120\nowidctlpar\noline\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \i\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon0 \snext20 caption;}{
15\s21\ql \li0\ri0\nowidctlpar\noline\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon0 \snext21 R\'e9pertoire;}{\s22\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0
16\f36\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 \sbasedon0 \snext22 Texte pr\'e9format\'e9;}}{\*\rsidtbl \rsid1927825\rsid2909638\rsid9461202\rsid13650188\rsid14032633}{\*\generator Microsoft Word 10.0.2627;}{\info{\title NAME}
17{\author melennec}{\operator melennec}{\creatim\yr2005\mo8\dy9\hr10\min14}{\revtim\yr2005\mo8\dy9\hr10\min14}{\version2}{\edmins3}{\nofpages3}{\nofwords609}{\nofchars3355}{\*\company CENA}{\nofcharsws3957}{\vern16437}}
18\paperw11905\paperh16837\margl1134\margr1134\margt1134\margb1134 \deftab709\widowctrl\ftntj\aenddoc\nospaceforul\lytprtmet\formshade\horzdoc\dghspace120\dgvspace120\dghorigin1701\dgvorigin1984\dghshow1\dgvshow0
19\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\bdrrlswsix\nolnhtadjtbl\oldas\rsidroot9461202 \donotshowmarkup1\fet0\sectd \linex0\endnhere\sectdefaultcl\sftntj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2
20\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6
21\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang
22{\pntxtb (}{\pntxta )}}\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\field{\*\fldinst {\insrsid16269393 TITLE}}{\fldrslt }}{
23\insrsid16269393 NAME
24\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 extract - determine meta-information about a file
25\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 SYNOPSIS
26\par }\pard\plain \s22\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f36\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\f5\insrsid16269393 extract }{\f5\insrsid16269393 [}{\b\f5\insrsid16269393 -abdfhLnrsvV}{
27\f5\insrsid16269393 ] [}{\b\f5\insrsid16269393 -B}{\f5\insrsid16269393 }{\i\f5\insrsid16269393 language}{\f5\insrsid16269393 ][}{\b\f5\insrsid16269393 -H}{\f5\insrsid16269393 }{\i\f5\insrsid16269393 hash-algorithm}{\f5\insrsid16269393 ][}{
28\b\f5\insrsid16269393 -l}{\f5\insrsid16269393 }{\i\f5\insrsid16269393 library}{\f5\insrsid16269393 ][}{\b\f5\insrsid16269393 -p }{\i\f5\insrsid16269393 type}{\f5\insrsid16269393 ]\line [}{\b\f5\insrsid16269393 -x}{\f5\insrsid16269393 }{
29\i\f5\insrsid16269393 type}{\f5\insrsid16269393 ] }{\i\f5\insrsid16269393 file }{\f5\insrsid16269393 ...
30\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 DESCRIPTION
31\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 This manual page documents version 0.4.0 of the }{\b\insrsid16269393 extract}{\insrsid16269393
32 command.
33\par }\pard\plain \s22\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f36\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393
34\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 extract}{\insrsid16269393
35 tests each file specified in the argument list in an attempt to infer meta-information from it. Each file is subjected to the meta-data extraction libraries from }{\b\insrsid16269393 libextractor}{\insrsid16269393 .
36\par }\pard\plain \s22\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f36\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393
37\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 libextractor}{\insrsid16269393 classifies meta-information (also referred to as keywords) int
38o types. A list of all types can be obtained with the }{\b\insrsid16269393 -L}{\insrsid16269393 option.
39\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 OPTIONS
40\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -a
41\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393
42Do not remove any duplicates, even if the keywords match exactly and have the same type (i.e. because the same keyword was found by different extractor libraries).
43\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -b
44\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Display the output in BiBTeX format. This implies the }{\b\insrsid16269393 -d}{
45\insrsid16269393 option.
46\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -B}{\insrsid16269393 }{\i\insrsid16269393 LANG
47\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393
48Use the generic plaintext extractor for the language with the 2-letter language code }{\i\insrsid16269393 LANG}{\insrsid16269393 . Supported languages are DA (Danish), DE (German), EN (English), ES (Spanish), IT (Italian) and NO (Norwegian).
49\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -d
50\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393
51Remove duplicates only if the types match exactly. By default, duplicates are removed if the types match or if one of the types is }{\i\insrsid16269393 unknown}{\insrsid16269393 (in this case, the duplicate of unknown type is removed).
52\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -f
53\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 add the filename(s) (without directory) to the list of keywords.
54\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -h
55\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Print a brief summary of the options.
56\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -H}{\insrsid16269393 }{\i\insrsid16269393 ALGORITHM
57\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Use the }{\i\insrsid16269393 ALGORITHM}{\insrsid16269393
58 to compute a hash of each file (possible algorithms are sha1 and md5).
59\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -L
60\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Print a list of all known keyword types.
61\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -n
62\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Do not use the default set of extractors (typic
63ally all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime-types), use only the extractors specified with the }{\b\insrsid16269393 -l}{\insrsid16269393 option.
64\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -r
65\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Remove all duplicates disregarding differences in the keyword type.
66\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -s
67\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Split keywords at delimi
68ters (space, comma, colon, etc.) and list split keywords to be of }{\i\insrsid16269393 unknown}{\insrsid16269393
69 type. This can also be done by loading the split-library. Using this option guarantees that the splitting is performed after all other libraries have been run. It is always performed before duplicate elimination.
70\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -v
71\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Print the version number and exit.
72\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -V
73\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Be verbose.
74\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -B
75\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Run the printable extractor (costly, generic extractor for binaries)
76\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -l}{\insrsid16269393 }{\i\insrsid16269393 libraries
77\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Use the specified }{\i\insrsid16269393 libraries}{\insrsid16269393
78 to extract keywords. The general format of libraries is }{\i\insrsid16269393 [[-LIBRARYNAME[:[-]LIBRARYNAME]*]}{\insrsid16269393 where }{\i\insrsid16269393 LIBRARYNAME}{\insrsid16269393 is a libextractor compatible library and typically of the form }{
79\i\insrsid16269393 libextractor_jpeg.so}{\insrsid16269393 . The minus before the libraryname indicates that this library should be run after all the libraries that were specified so
80 far. If the minus is missing, the library is run before all previously specified libraries.
81\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -p}{\insrsid16269393 }{\i\insrsid16269393 type
82\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Print only the keywords matching the specified }{\i\insrsid16269393 type}{
83\insrsid16269393 . By default, all keywords that are found and not removed as duplicates are printed.
84\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 -x}{\insrsid16269393 }{\i\insrsid16269393 type
85\par }\pard\plain \s17\ql \fi283\li0\ri0\sa120\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 Exclude keywords of the specified }{\i\insrsid16269393 type}{\insrsid16269393
86 from the output. By default, all keywords that are found and not removed as duplicates are printed.
87\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 SEE ALSO
88\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 libextractor (3) - description of the libextractor library
89\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 EXAMPLES
90\par }\pard\plain \s22\ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f36\fs20\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\f5\fs18\insrsid16269393 $ extract test/test.jpg
91\par }{\f5\fs18\insrsid16269393 comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1
92\par mimetype - image/jpeg
93\par
94\par }{\b\f5\fs18\insrsid16269393 $ extract -Vf -x comment test/test.jpg
95\par }{\f5\fs18\insrsid16269393 Keywords for file test/test.jpg:
96\par mimetype - image/jpeg
97\par filename - test.jpg
98\par
99\par }{\b\f5\fs18\insrsid16269393 $ extract -p comment test/test.jpg
100\par }{\f5\fs18\insrsid16269393 comment - (C) 2001 by Christian Grothoff, using gimp 1.2 1
101\par
102\par }{\b\f5\fs18\insrsid16269393 $ extract -nV -l libextractor_png.so -p comment test/test.jpg test/test.png
103\par }{\f5\fs18\insrsid16269393 Keywords for file test/test.jpg:
104\par Keywords for file test/test.png:
105\par comment - Testing keyword extraction
106\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 LEGAL NOTICE
107\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 libextractor and the extract tool are released under the GPL.
108\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 BUGS
109\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 A couple of file-formats (on the order of 10^3) are not recognized...
110\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 AUTHORS
111\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\b\insrsid16269393 extract}{\insrsid16269393
112 was originally written by Christian Grothoff <christian@grothoff.org> and Vidyut Samanta <vids@cs.ucla.edu>. Use <}{\cs15\ul\cf9\insrsid16269393 libextractor@cs.purdue.edu}{\insrsid16269393 > to contact the current maintainer(s).
113\par }\pard\plain \s18\ql \li0\ri0\sb240\sa120\keepn\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \f1\fs28\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 AVAILABILITY
114\par }\pard\plain \ql \li0\ri0\nowidctlpar\hyphpar0\nooverflow\faroman\rin0\lin0\itap0 \fs24\lang1036\langfe1036\cgrid\langnp1036\langfenp1036 {\insrsid16269393 You can obtain the original author's latest version from }{\cs15\ul\cf9\insrsid16269393
115http://ovmj.org/libextractor/}{\insrsid16269393 .
116\par }} \ No newline at end of file
diff --git a/test/extract.pdf b/test/extract.pdf
new file mode 100644
index 0000000..81bdce5
--- /dev/null
+++ b/test/extract.pdf
Binary files differ
diff --git a/test/extract.sxw b/test/extract.sxw
new file mode 100644
index 0000000..9f4444a
--- /dev/null
+++ b/test/extract.sxw
Binary files differ
diff --git a/test/meta.xml b/test/meta.xml
new file mode 100644
index 0000000..c33250e
--- /dev/null
+++ b/test/meta.xml
@@ -0,0 +1,2 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE office:document-meta PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "office.dtd"><office:document-meta xmlns:office="http://openoffice.org/2000/office" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:meta="http://openoffice.org/2000/meta" office:version="1.0"><office:meta><meta:generator>OpenOffice.org 1.1.4 (Linux)</meta:generator><!--645m52(Build:8824)--><meta:creation-date>2005-08-09T09:26:56</meta:creation-date><dc:creator>Ronan Melennec</dc:creator><dc:date>2005-08-09T09:58:12</dc:date><dc:language>fr-FR</dc:language><meta:editing-cycles>1</meta:editing-cycles><meta:editing-duration>PT0S</meta:editing-duration><meta:user-defined meta:name="Info 1"/><meta:user-defined meta:name="Info 2"/><meta:user-defined meta:name="Info 3"/><meta:user-defined meta:name="Info 4"/><meta:document-statistic meta:table-count="0" meta:image-count="0" meta:object-count="0" meta:page-count="2" meta:paragraph-count="72" meta:word-count="621" meta:character-count="3885"/></office:meta></office:document-meta> \ No newline at end of file
diff --git a/test/settings.xml b/test/settings.xml
new file mode 100644
index 0000000..e623b32
--- /dev/null
+++ b/test/settings.xml
@@ -0,0 +1,2 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE office:document-settings PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "office.dtd"><office:document-settings xmlns:office="http://openoffice.org/2000/office" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:config="http://openoffice.org/2001/config" office:version="1.0"><office:settings><config:config-item-set config:name="view-settings"><config:config-item config:name="ViewAreaTop" config:type="int">45201</config:config-item><config:config-item config:name="ViewAreaLeft" config:type="int">2143</config:config-item><config:config-item config:name="ViewAreaWidth" config:type="int">18658</config:config-item><config:config-item config:name="ViewAreaHeight" config:type="int">16709</config:config-item><config:config-item config:name="ShowRedlineChanges" config:type="boolean">false</config:config-item><config:config-item config:name="ShowHeaderWhileBrowsing" config:type="boolean">false</config:config-item><config:config-item config:name="ShowFooterWhileBrowsing" config:type="boolean">false</config:config-item><config:config-item config:name="InBrowseMode" config:type="boolean">false</config:config-item><config:config-item-map-indexed config:name="Views"><config:config-item-map-entry><config:config-item config:name="ViewId" config:type="string">view2</config:config-item><config:config-item config:name="ViewLeft" config:type="int">3306</config:config-item><config:config-item config:name="ViewTop" config:type="int">58055</config:config-item><config:config-item config:name="VisibleLeft" config:type="int">2143</config:config-item><config:config-item config:name="VisibleTop" config:type="int">45201</config:config-item><config:config-item config:name="VisibleRight" config:type="int">20800</config:config-item><config:config-item config:name="VisibleBottom" config:type="int">61909</config:config-item><config:config-item config:name="ZoomType" config:type="short">0</config:config-item><config:config-item config:name="ZoomFactor" config:type="short">100</config:config-item><config:config-item config:name="IsSelectedFrame" config:type="boolean">false</config:config-item></config:config-item-map-entry></config:config-item-map-indexed></config:config-item-set><config:config-item-set config:name="configuration-settings"><config:config-item config:name="AddParaTableSpacing" config:type="boolean">false</config:config-item><config:config-item config:name="PrintReversed" config:type="boolean">false</config:config-item><config:config-item config:name="LinkUpdateMode" config:type="short">1</config:config-item><config:config-item config:name="CharacterCompressionType" config:type="short">0</config:config-item><config:config-item config:name="PrintSingleJobs" config:type="boolean">false</config:config-item><config:config-item config:name="UpdateFromTemplate" config:type="boolean">false</config:config-item><config:config-item config:name="PrintPaperFromSetup" config:type="boolean">false</config:config-item><config:config-item config:name="AddFrameOffsets" config:type="boolean">false</config:config-item><config:config-item config:name="PrintLeftPages" config:type="boolean">true</config:config-item><config:config-item config:name="PrintTables" config:type="boolean">true</config:config-item><config:config-item config:name="ChartAutoUpdate" config:type="boolean">true</config:config-item><config:config-item config:name="PrintControls" config:type="boolean">true</config:config-item><config:config-item config:name="PrinterSetup" config:type="base64Binary">ugL+/0dlbmVyaWMgUHJpbnRlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAU0dFTlBSVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWAAMAAAIAAAAA//8BAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9R2VuZXJpYyBQcmludGVyCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCnNjYWxlPTEwMAptYXJnaW5kYWp1c3RtZW50PTAsMCwwLDAKY29sb3JkZXB0aD0yNApwc2xldmVsPTAKY29sb3JkZXZpY2U9MApQUERDb250ZXhEYXRhClBhZ2VTaXplOkE0AABudHMAY2NlcHRzIFBQAAAA2AYAADj8u0UAYZ1AAPy7RQEgaW44AAAAGAAAAPj7u0X4+7tFOyAgICAgICAgIHdpdGggcmVzaWRlbnQgZm9udHMgb3Igbm90APy7RWAAAAAoBwAAmPy7RVD8u0WA/LtFAGJlIEgAAAAYAAAASPy7RUj8u0VQZXJmb3JtRm9udFN1YnN0aXR1dGlvbgCo/LtFiPy7RRgAAAAwAAAAePy7RXj8u0V0cnVlAPy7RRgAAACIBwAAsPy7RQBhnUAAYZ1AAQUAAHAAAACgBwAAIP27RQBhnUDI/LtFAQUAAFgAAAAYAAAAwPy7RcD8u0U7IFN1YnN0Rm9udF88ZG93bmxvYWRhYmxlIGZvbnQ+OiB0aGUgcmVzaWRlbnQgZm9udCB0byByZXBsYWNlAG93bmxvYWRhYmx4AAAAEAgAAIBCvEUAYZ1AOP27RQEFAAA=</config:config-item><config:config-item config:name="PrintAnnotationMode" config:type="short">0</config:config-item><config:config-item config:name="ApplyUserData" config:type="boolean">true</config:config-item><config:config-item config:name="FieldAutoUpdate" config:type="boolean">true</config:config-item><config:config-item config:name="SaveVersionOnClose" config:type="boolean">false</config:config-item><config:config-item config:name="SaveGlobalDocumentLinks" config:type="boolean">false</config:config-item><config:config-item config:name="IsKernAsianPunctuation" config:type="boolean">false</config:config-item><config:config-item config:name="AlignTabStopPosition" config:type="boolean">false</config:config-item><config:config-item config:name="CurrentDatabaseDataSource" config:type="string"/><config:config-item config:name="PrinterName" config:type="string">Generic Printer</config:config-item><config:config-item config:name="PrintFaxName" config:type="string"/><config:config-item config:name="PrintRightPages" config:type="boolean">true</config:config-item><config:config-item config:name="IsLabelDocument" config:type="boolean">false</config:config-item><config:config-item config:name="AddParaTableSpacingAtStart" config:type="boolean">false</config:config-item><config:config-item config:name="PrintProspect" config:type="boolean">false</config:config-item><config:config-item config:name="PrintGraphics" config:type="boolean">true</config:config-item><config:config-item config:name="AllowPrintJobCancel" config:type="boolean">true</config:config-item><config:config-item config:name="CurrentDatabaseCommandType" config:type="int">0</config:config-item><config:config-item config:name="PrinterIndependentLayout" config:type="string">enabled</config:config-item><config:config-item config:name="UseOldNumbering" config:type="boolean">false</config:config-item><config:config-item config:name="PrintPageBackground" config:type="boolean">true</config:config-item><config:config-item config:name="CurrentDatabaseCommand" config:type="string"/><config:config-item config:name="PrintDrawings" config:type="boolean">true</config:config-item><config:config-item config:name="PrintBlackFonts" config:type="boolean">false</config:config-item></config:config-item-set></office:settings></office:document-settings> \ No newline at end of file
diff --git a/test/styles.xml b/test/styles.xml
new file mode 100644
index 0000000..41764dc
--- /dev/null
+++ b/test/styles.xml
@@ -0,0 +1,2 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE office:document-styles PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "office.dtd"><office:document-styles xmlns:office="http://openoffice.org/2000/office" xmlns:style="http://openoffice.org/2000/style" xmlns:text="http://openoffice.org/2000/text" xmlns:table="http://openoffice.org/2000/table" xmlns:draw="http://openoffice.org/2000/drawing" xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:number="http://openoffice.org/2000/datastyle" xmlns:svg="http://www.w3.org/2000/svg" xmlns:chart="http://openoffice.org/2000/chart" xmlns:dr3d="http://openoffice.org/2000/dr3d" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns:form="http://openoffice.org/2000/form" xmlns:script="http://openoffice.org/2000/script" office:version="1.0"><office:font-decls><style:font-decl style:name="Lucidasans1" fo:font-family="Lucidasans"/><style:font-decl style:name="Bitstream Vera Sans Mono" fo:font-family="&apos;Bitstream Vera Sans Mono&apos;" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Courier" fo:font-family="Courier" style:font-family-generic="modern" style:font-pitch="fixed"/><style:font-decl style:name="Bitstream Vera Sans" fo:font-family="&apos;Bitstream Vera Sans&apos;" style:font-pitch="variable"/><style:font-decl style:name="Lucidasans" fo:font-family="Lucidasans" style:font-pitch="variable"/><style:font-decl style:name="Mincho" fo:font-family="Mincho" style:font-pitch="variable"/><style:font-decl style:name="Times New Roman" fo:font-family="&apos;Times New Roman&apos;" style:font-family-generic="roman" style:font-pitch="variable"/><style:font-decl style:name="Arial" fo:font-family="Arial" style:font-family-generic="swiss" style:font-pitch="variable"/></office:font-decls><office:styles><style:default-style style:family="graphics"><style:properties draw:start-line-spacing-horizontal="0.283cm" draw:start-line-spacing-vertical="0.283cm" draw:end-line-spacing-horizontal="0.283cm" draw:end-line-spacing-vertical="0.283cm" style:use-window-font-color="true" style:font-name="Times New Roman" fo:font-size="12pt" fo:language="fr" fo:country="FR" style:font-name-asian="Bitstream Vera Sans" style:font-size-asian="12pt" style:language-asian="none" style:country-asian="none" style:font-name-complex="Lucidasans" style:font-size-complex="12pt" style:language-complex="none" style:country-complex="none" style:text-autospace="ideograph-alpha" style:line-break="strict" style:writing-mode="lr-tb"><style:tab-stops/></style:properties></style:default-style><style:default-style style:family="paragraph"><style:properties style:use-window-font-color="true" style:font-name="Times New Roman" fo:font-size="12pt" fo:language="fr" fo:country="FR" style:font-name-asian="Bitstream Vera Sans" style:font-size-asian="12pt" style:language-asian="none" style:country-asian="none" style:font-name-complex="Lucidasans" style:font-size-complex="12pt" style:language-complex="none" style:country-complex="none" fo:hyphenate="false" fo:hyphenation-remain-char-count="2" fo:hyphenation-push-char-count="2" fo:hyphenation-ladder-count="no-limit" style:text-autospace="ideograph-alpha" style:punctuation-wrap="hanging" style:line-break="strict" style:tab-stop-distance="1.251cm" style:writing-mode="page"/></style:default-style><style:style style:name="Standard" style:family="paragraph" style:class="text"/><style:style style:name="Text body" style:family="paragraph" style:parent-style-name="Standard" style:class="text"><style:properties fo:margin-top="0cm" fo:margin-bottom="0.212cm"/></style:style><style:style style:name="First line indent" style:family="paragraph" style:parent-style-name="Text body" style:class="text"><style:properties fo:margin-left="0cm" fo:margin-right="0cm" fo:text-indent="0.499cm" style:auto-text-indent="false"/></style:style><style:style style:name="Heading" style:family="paragraph" style:parent-style-name="Standard" style:next-style-name="Text body" style:class="text"><style:properties fo:margin-top="0.423cm" fo:margin-bottom="0.212cm" style:font-name="Arial" fo:font-size="14pt" style:font-name-asian="Mincho" style:font-size-asian="14pt" style:font-name-complex="Lucidasans" style:font-size-complex="14pt" fo:keep-with-next="true"/></style:style><style:style style:name="List" style:family="paragraph" style:parent-style-name="Text body" style:class="list"><style:properties style:font-name-complex="Lucidasans1"/></style:style><style:style style:name="Caption" style:family="paragraph" style:parent-style-name="Standard" style:class="extra"><style:properties fo:margin-top="0.212cm" fo:margin-bottom="0.212cm" fo:font-size="10pt" fo:font-style="italic" style:font-size-asian="10pt" style:font-style-asian="italic" style:font-name-complex="Lucidasans1" style:font-size-complex="10pt" style:font-style-complex="italic" text:number-lines="false" text:line-number="0"/></style:style><style:style style:name="Index" style:family="paragraph" style:parent-style-name="Standard" style:class="index"><style:properties style:font-name-complex="Lucidasans1" text:number-lines="false" text:line-number="0"/></style:style><style:style style:name="Preformatted Text" style:family="paragraph" style:parent-style-name="Standard" style:class="html"><style:properties fo:margin-top="0cm" fo:margin-bottom="0cm" style:font-name="Bitstream Vera Sans Mono" fo:font-size="10pt" style:font-name-asian="Bitstream Vera Sans Mono" style:font-size-asian="10pt" style:font-name-complex="Bitstream Vera Sans Mono" style:font-size-complex="10pt"/></style:style><style:style style:name="Internet link" style:family="text"><style:properties fo:color="#000080" style:text-underline="single" style:text-underline-color="font-color"/></style:style><text:outline-style><text:outline-level-style text:level="1" style:num-format=""/><text:outline-level-style text:level="2" style:num-format=""/><text:outline-level-style text:level="3" style:num-format=""/><text:outline-level-style text:level="4" style:num-format=""/><text:outline-level-style text:level="5" style:num-format=""/><text:outline-level-style text:level="6" style:num-format=""/><text:outline-level-style text:level="7" style:num-format=""/><text:outline-level-style text:level="8" style:num-format=""/><text:outline-level-style text:level="9" style:num-format=""/><text:outline-level-style text:level="10" style:num-format=""/></text:outline-style><text:footnotes-configuration style:num-format="1" text:start-value="0" text:footnotes-position="page" text:start-numbering-at="document"/><text:endnotes-configuration style:num-format="i" text:start-value="0"/><text:linenumbering-configuration text:number-lines="false" text:offset="0.499cm" style:num-format="1" text:number-position="left" text:increment="5"/></office:styles><office:automatic-styles><style:page-master style:name="pm1"><style:properties fo:page-width="20.999cm" fo:page-height="29.699cm" style:num-format="1" style:print-orientation="portrait" fo:margin-top="2cm" fo:margin-bottom="2cm" fo:margin-left="2cm" fo:margin-right="2cm" style:writing-mode="lr-tb" style:footnote-max-height="0cm"><style:footnote-sep style:width="0.018cm" style:distance-before-sep="0.101cm" style:distance-after-sep="0.101cm" style:adjustment="left" style:rel-width="25%" style:color="#000000"/></style:properties><style:header-style/><style:footer-style/></style:page-master></office:automatic-styles><office:master-styles><style:master-page style:name="Standard" style:page-master-name="pm1"/></office:master-styles></office:document-styles> \ No newline at end of file