Extractor.cs (11565B)
1 // Extractor.cs 2 // 3 // Copyright (C) 2008, 2009 Patrick Ulbrich, zulu99@gmx.net 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful, 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU General Public License for more details. 14 // 15 // You should have received a copy of the GNU General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 // 18 19 // NOTE: 20 // 21 // The following functions have been implemented directly (based on the libextractor original code) 22 // as a pinvoke call into the native library would involve a complicated conversion 23 // of the managed Keyword[] array into an unmanaged linked list. 24 // On top of that the native library would also try to free that list :-(. 25 // The code of those functions is so simple that it isn't worth it anyway... 
//
// EXTRACTOR_KeywordList * EXTRACTOR_removeDuplicateKeywords(EXTRACTOR_KeywordList * list, unsigned int options);
// EXTRACTOR_KeywordList * EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list);
// EXTRACTOR_KeywordList * EXTRACTOR_removeKeywordsOfType(EXTRACTOR_KeywordList * list, EXTRACTOR_KeywordType type);
// const char * EXTRACTOR_extractLast(EXTRACTOR_KeywordType type, EXTRACTOR_KeywordList * keywords);
// const char * EXTRACTOR_extractLastByString(const char * type, EXTRACTOR_KeywordList * keywords);

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;

namespace LibExtractor
{
    /// <summary>
    /// Managed wrapper around a native libextractor extractor list.
    /// The instance owns the native list handle and releases it when it is
    /// disposed or finalized.
    /// </summary>
    public class Extractor : IDisposable
    {
        // Handle of the native extractor list; IntPtr.Zero while no library is loaded.
        private IntPtr pExtractors;
        private bool disposed;

        /// <summary>
        /// Creates an extractor with no libraries loaded.
        /// </summary>
        public Extractor() {
            disposed = false;
            pExtractors = IntPtr.Zero;
        }

        ~Extractor() {
            Dispose(false);
        }

        //
        // Instance members
        //

        /// <summary>
        /// Loads the default library set. Any libraries that are currently
        /// loaded are removed first.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        public void LoadDefaultLibraries() {
            EnsureNotDisposed();
            if (pExtractors != IntPtr.Zero)
                RemoveAllLibraries();

            pExtractors = EXTRACTOR_loadDefaultLibraries();
        }

        /// <summary>
        /// Passes <paramref name="config"/> and the current extractor list to
        /// EXTRACTOR_loadConfigLibraries and stores the returned list.
        /// </summary>
        /// <param name="config">Library configuration string; must not be null or empty.</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="config"/> is empty.</exception>
        public void LoadConfigLibraries(string config) {
            EnsureNotDisposed();
            EnsureValidStringParam(config, "config");
            // prev parameter may be null, so don't test for loaded extractors.
            pExtractors = EXTRACTOR_loadConfigLibraries(pExtractors, config);
        }

        /// <summary>
        /// Passes <paramref name="library"/> and the current extractor list to
        /// EXTRACTOR_addLibrary and stores the returned list.
        /// </summary>
        /// <param name="library">Library name; must not be null or empty.</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="library"/> is empty.</exception>
        public void AddLibrary(string library) {
            EnsureNotDisposed();
            EnsureValidStringParam(library, "library");
            // prev parameter may be null, so don't test for loaded extractors.
            pExtractors = EXTRACTOR_addLibrary(pExtractors, library);
        }

        /// <summary>
        /// Passes <paramref name="library"/> and the current extractor list to
        /// EXTRACTOR_addLibraryLast and stores the returned list.
        /// </summary>
        /// <param name="library">Library name; must not be null or empty.</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="library"/> is empty.</exception>
        public void AddLibraryLast(string library) {
            EnsureNotDisposed();
            EnsureValidStringParam(library, "library");
            // prev parameter may be null, so don't test for loaded extractors.
            pExtractors = EXTRACTOR_addLibraryLast(pExtractors, library);
        }

        /// <summary>
        /// Passes <paramref name="library"/> and the current extractor list to
        /// EXTRACTOR_removeLibrary and stores the returned list.
        /// </summary>
        /// <param name="library">Library name; must not be null or empty.</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="library"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="library"/> is empty.</exception>
        public void RemoveLibrary(string library) {
            EnsureNotDisposed();
            EnsureValidStringParam(library, "library");
            // prev parameter may be null, so don't test for loaded extractors.
            pExtractors = EXTRACTOR_removeLibrary(pExtractors, library);
        }

        /// <summary>
        /// Hands the current extractor list to EXTRACTOR_removeAll and resets
        /// the handle to IntPtr.Zero.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        public void RemoveAllLibraries() {
            EnsureNotDisposed();
            EXTRACTOR_removeAll(pExtractors);
            pExtractors = IntPtr.Zero;
        }

        /// <summary>
        /// Extracts the keywords of a file using the loaded libraries.
        /// </summary>
        /// <param name="filename">File to inspect; must not be null or empty.</param>
        /// <returns>The extracted keywords; an empty array if the native call returns no list.</returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="filename"/> is empty.</exception>
        /// <exception cref="InvalidOperationException">No extractor libraries are loaded.</exception>
        public Keyword[] GetKeywords(string filename) {
            EnsureNotDisposed();
            EnsureValidStringParam(filename, "filename");
            EnsureExtractors();

            List<Keyword> list = GetKeywordsInternal(EXTRACTOR_getKeywords(pExtractors, filename));
            return list.ToArray();
        }

        /// <summary>
        /// Extracts the keywords of an unmanaged memory buffer using the loaded libraries.
        /// </summary>
        /// <param name="data">Pointer to the buffer; must not be IntPtr.Zero.</param>
        /// <param name="size">Buffer size passed to the native call; must be greater than 0.</param>
        /// <returns>The extracted keywords; an empty array if the native call returns no list.</returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="InvalidOperationException">No extractor libraries are loaded.</exception>
        /// <exception cref="ArgumentException"><paramref name="data"/> is IntPtr.Zero or <paramref name="size"/> is not positive.</exception>
        public Keyword[] GetKeywords(IntPtr data, int size) {
            EnsureNotDisposed();
            EnsureExtractors();

            if (data == IntPtr.Zero)
                throw new ArgumentException("Data must not be a null pointer", "data");
            if (size <= 0)
                throw new ArgumentException("Size must be greater than 0", "size");

            List<Keyword> list = GetKeywordsInternal(EXTRACTOR_getKeywords2(pExtractors, data, size));
            return list.ToArray();
        }

        /// <summary>
        /// Extracts the keywords of a managed byte buffer. The bytes are copied
        /// into a temporary unmanaged buffer for the duration of the call.
        /// </summary>
        /// <param name="data">Buffer to inspect; must not be null or empty.</param>
        /// <returns>The extracted keywords; an empty array if the native call returns no list.</returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
        /// <exception cref="InvalidOperationException">No extractor libraries are loaded.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="data"/> has length 0.</exception>
        public Keyword[] GetKeywords(byte[] data) {
            EnsureNotDisposed();
            EnsureExtractors();

            if (data == null)
                throw new ArgumentNullException("data");
            if (data.Length == 0)
                throw new ArgumentException("Length of data must be greater than 0", "data");

            IntPtr pMem = IntPtr.Zero;
            try {
                pMem = Marshal.AllocHGlobal(data.Length);
                Marshal.Copy(data, 0, pMem, data.Length);
                return GetKeywords(pMem, data.Length);
            } finally {
                if (pMem != IntPtr.Zero)
                    Marshal.FreeHGlobal(pMem);
            }
        }

        // Converts the native keyword list starting at pKeywords into managed
        // Keyword objects and then hands the list back to EXTRACTOR_freeKeywords.
        //
        // The list is walked with a separate cursor so that the head pointer is
        // still available in the finally block; freeing via the cursor would
        // never release anything on success, because the cursor is IntPtr.Zero
        // once the loop has finished.
        //
        // NOTE(review): after this call the 'next' field of the returned
        // Keyword objects refers to released native memory and must not be
        // dereferenced - confirm no caller walks the native list through it.
        private List<Keyword> GetKeywordsInternal(IntPtr pKeywords) {
            IntPtr pHead = pKeywords;
            try {
                List<Keyword> list = new List<Keyword>();
                IntPtr pCurrent = pHead;
                while (pCurrent != IntPtr.Zero) {
                    Keyword k = (Keyword)Marshal.PtrToStructure(pCurrent, typeof(Keyword));
                    list.Add(k);
                    pCurrent = k.next;
                }
                return list;
            } finally {
                if (pHead != IntPtr.Zero)
                    EXTRACTOR_freeKeywords(pHead);
            }
        }

        //
        // Static members
        //

        /// <summary>
        /// Returns an Extractor instance with the default library set loaded.
        /// </summary>
        public static Extractor GetDefault() {
            Extractor e = new Extractor();
            e.LoadDefaultLibraries();
            return e;
        }

        /// <summary>
        /// Returns the string that EXTRACTOR_getKeywordTypeAsString reports for
        /// <paramref name="type"/>, converted from an ANSI string.
        /// </summary>
        public static string GetKeywordTypeAsString(KeywordType type) {
            // NOTE : string does NOT need to be freed.
            IntPtr pStr = EXTRACTOR_getKeywordTypeAsString(type);
            return Marshal.PtrToStringAnsi(pStr);
        }

        /// <summary>
        /// Returns the value reported by EXTRACTOR_getHighestKeywordTypeNumber.
        /// </summary>
        public static KeywordType GetHighestKeywordTypeNumber() {
            return EXTRACTOR_getHighestKeywordTypeNumber();
        }

        /// <summary>
        /// Removes duplicate keywords. The array passed in is never modified;
        /// if nothing has to be removed, the same array instance is returned.
        /// Null entries are dropped from the result whenever at least one
        /// duplicate is removed.
        /// </summary>
        /// <param name="keywords">Keywords to filter; must not be null.</param>
        /// <param name="options">Flags controlling which entries count as duplicates.</param>
        /// <returns>The filtered keywords.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public static Keyword[] RemoveDuplicateKeywords(Keyword[] keywords, DuplicateOptions options) {
            if (keywords == null)
                throw new ArgumentNullException("keywords");

            int removed = 0;

            for (int i = 0; i < keywords.Length; i++) {
                Keyword current = keywords[i];

                // entries already marked as removed (or null in the input) are skipped.
                if (current == null)
                    continue;

                KeywordType type = current.keywordType;
                string keyword = current.keyword;

                for (int j = 0; j < keywords.Length; j++) {
                    Keyword pos = keywords[j];

                    if ((i == j) || (pos == null))
                        continue;

                    if ( (pos.keyword == keyword) &&
                         ( (pos.keywordType == type) ||
                           ( ((options & DuplicateOptions.DUPLICATES_TYPELESS) > 0) &&
                             ( (pos.keywordType == KeywordType.EXTRACTOR_SPLIT) ||
                               (type != KeywordType.EXTRACTOR_SPLIT)) ) ||
                           ( ((options & DuplicateOptions.DUPLICATES_REMOVE_UNKNOWN) > 0) &&
                             (pos.keywordType == KeywordType.EXTRACTOR_UNKNOWN)) ) ) {

                        if (removed == 0) {
                            // do not modify the original array.
                            // lazy copy - copy only if the array
                            // will be modified.
                            keywords = CopyKeywords(keywords);
                        }

                        // mark keyword as removed.
                        keywords[j] = null;
                        removed++;
                    }
                }
            }

            return RemoveNullKeywords(keywords, removed);
        }

        /// <summary>
        /// Removes all keywords whose text is null, empty or consists of
        /// whitespace only. The array passed in is never modified; if nothing
        /// has to be removed, the same array instance is returned.
        /// Null entries are kept as they are.
        /// </summary>
        /// <param name="keywords">Keywords to filter; must not be null.</param>
        /// <returns>The filtered keywords.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public static Keyword[] RemoveEmptyKeywords(Keyword[] keywords) {
            if (keywords == null)
                throw new ArgumentNullException("keywords");

            return RemoveWhere(keywords, delegate(Keyword k) { return IsBlank(k.keyword); });
        }

        /// <summary>
        /// Removes all keywords of the given type. The array passed in is never
        /// modified; if nothing has to be removed, the same array instance is
        /// returned. Null entries are kept as they are.
        /// </summary>
        /// <param name="keywords">Keywords to filter; must not be null.</param>
        /// <param name="type">Keyword type to remove.</param>
        /// <returns>The filtered keywords.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public static Keyword[] RemoveKeywordsOfType(Keyword[] keywords, KeywordType type) {
            if (keywords == null)
                throw new ArgumentNullException("keywords");

            return RemoveWhere(keywords, delegate(Keyword k) { return k.keywordType == type; });
        }

        /// <summary>
        /// Returns the text of the last keyword of the given type, or null if
        /// there is no such keyword. Null entries are skipped.
        /// </summary>
        /// <param name="type">Keyword type to look for.</param>
        /// <param name="keywords">Keywords to search; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public static string ExtractLast(KeywordType type, Keyword[] keywords) {
            if (keywords == null)
                throw new ArgumentNullException("keywords");

            string result = null;
            for (int i = 0; i < keywords.Length; i++) {
                Keyword pos = keywords[i];
                if ((pos != null) && (pos.keywordType == type)) {
                    result = pos.keyword;
                }
            }
            return result;
        }

        /// <summary>
        /// Returns the text of the last keyword whose type string (as returned
        /// by <see cref="GetKeywordTypeAsString"/>) equals <paramref name="type"/>,
        /// or null if there is no such keyword. Null entries are skipped.
        /// </summary>
        /// <remarks>NOTE : does not work with translated strings.</remarks>
        /// <param name="type">Type string to look for.</param>
        /// <param name="keywords">Keywords to search; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="keywords"/> is null.</exception>
        public static string ExtractLastByString(string type, Keyword[] keywords) {
            if (keywords == null)
                throw new ArgumentNullException("keywords");

            string result = null;
            for (int i = 0; i < keywords.Length; i++) {
                Keyword pos = keywords[i];
                if ((pos != null) && (GetKeywordTypeAsString(pos.keywordType) == type)) {
                    result = pos.keyword;
                }
            }
            return result;
        }

        // Returns true if text is null, empty or contains whitespace only.
        private static bool IsBlank(string text) {
            if (text == null)
                return true;

            for (int i = 0; i < text.Length; i++) {
                if (!char.IsWhiteSpace(text[i]))
                    return false;
            }
            return true;
        }

        // Single-pass filter: returns the entries for which shouldRemove is false,
        // in their original order. Null entries are kept without consulting the
        // predicate. The input array is not modified; the result list is created
        // lazily, so the input instance itself is returned if nothing is removed.
        private static Keyword[] RemoveWhere(Keyword[] keywords, Predicate<Keyword> shouldRemove) {
            List<Keyword> kept = null;

            for (int i = 0; i < keywords.Length; i++) {
                Keyword pos = keywords[i];

                if ((pos != null) && shouldRemove(pos)) {
                    if (kept == null) {
                        // first removal - everything before index i is retained.
                        kept = new List<Keyword>(keywords.Length);
                        for (int j = 0; j < i; j++)
                            kept.Add(keywords[j]);
                    }
                } else if (kept != null) {
                    kept.Add(pos);
                }
            }

            if (kept == null)
                return keywords;
            else
                return kept.ToArray();
        }

        // Returns a shallow copy of the given array.
        private static Keyword[] CopyKeywords(Keyword[] original) {
            Keyword[] copy = new Keyword[original.Length];
            Array.Copy(original, copy, original.Length);
            return copy;
        }

        // Returns the non-null entries of keywords in their original order.
        // nullCount must be the number of null entries to drop; if it is less
        // than 1 the input array is returned unchanged.
        private static Keyword[] RemoveNullKeywords(Keyword[] keywords, int nullCount) {
            if (nullCount < 1)
                return keywords;

            Keyword[] copy = new Keyword[keywords.Length - nullCount];
            int n = 0;

            for (int i = 0; i < keywords.Length; i++) {
                if (keywords[i] != null)
                    copy[n++] = keywords[i];
            }

            return copy;
        }

        //
        // Cleanup stuff
        //

        #region IDisposable members
        /// <summary>
        /// Removes all loaded libraries and marks the instance as disposed.
        /// Calling it more than once has no further effect.
        /// </summary>
        public void Dispose() {
            Dispose(true);
        }

        // Shared cleanup for Dispose() (disposing == true) and the finalizer
        // (disposing == false).
        private void Dispose(bool disposing) {
            if (disposed)
                return;

            if (pExtractors != IntPtr.Zero)
                RemoveAllLibraries();

            if (disposing)
                GC.SuppressFinalize(this);

            disposed = true;
        }
        #endregion

        //
        // Helper methods
        //

        private void EnsureNotDisposed() {
            if (disposed)
                throw new ObjectDisposedException("Extractor");
        }

        private void EnsureExtractors() {
            if (pExtractors == IntPtr.Zero)
                throw new InvalidOperationException("No extractor libraries loaded");
        }

        // Throws ArgumentNullException for a null string and ArgumentException
        // for an empty one.
        private void EnsureValidStringParam(string param, string paramName) {
            if (param == null)
                throw new ArgumentNullException(paramName);
            if (param.Length == 0)
                throw new ArgumentException("Parameter must not be empty", paramName);
        }

        //
        // Native libextractor imports
        //

        #region Native imports
        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_loadDefaultLibraries();

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_loadConfigLibraries(IntPtr prev, string config);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_addLibrary(IntPtr prev, string library);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_addLibraryLast(IntPtr prev, string library);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_removeLibrary(IntPtr prev, string library);

        [DllImport("libextractor")]
        private static extern void EXTRACTOR_removeAll(IntPtr libraries);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_getKeywords(IntPtr extractors, string filename);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_getKeywords2(IntPtr extractors, IntPtr data, int size);

        [DllImport("libextractor")]
        private static extern void EXTRACTOR_freeKeywords(IntPtr keywords);

        [DllImport("libextractor")]
        private static extern IntPtr EXTRACTOR_getKeywordTypeAsString(KeywordType type);

        [DllImport("libextractor")]
        private static extern KeywordType EXTRACTOR_getHighestKeywordTypeNumber();
        #endregion
    }
}