diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp.cpp abi/src/wp/impexp/xp/ie_imp.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp.cpp	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp.cpp	Mon Feb 7 11:13:03 2000
@@ -38,6 +38,8 @@
 
 struct _imp
 {
+	UT_Bool (*fpRecognizeContents)(const char * szBuf,
+			int iNumbytes);
 	UT_Bool (*fpRecognizeSuffix)(const char * szSuffix);
 	UT_Error (*fpStaticConstructor)(PD_Document * pDocument,
 			IE_Imp ** ppie);
@@ -47,16 +49,16 @@
 	UT_Bool (*fpSupportsFileType)(IEFileType ft);
 };
 
-#define DeclareImporter(n) { n::RecognizeSuffix, n::StaticConstructor, n::GetDlgLabels, n::SupportsFileType }
+#define DeclareImporter(n) { n::RecognizeContents, n::RecognizeSuffix, n::StaticConstructor, n::GetDlgLabels, n::SupportsFileType }
 
 static struct _imp s_impTable[] =
 {
 	DeclareImporter(IE_Imp_AbiWord_1),
 	DeclareImporter(IE_Imp_GZipAbiWord),
-	DeclareImporter(IE_Imp_Text),
 	DeclareImporter(IE_Imp_RTF),
 	DeclareImporter(IE_Imp_MsWord_97),
 	DeclareImporter(IE_Imp_UTF8),
+	DeclareImporter(IE_Imp_Text),
 };
 
 
@@ -75,10 +77,38 @@
 /*****************************************************************/
 /*****************************************************************/
 
+IEFileType IE_Imp::fileTypeForContents(const char * szBuf, int iNumbytes)
+{
+	// we have to construct the loop this way because a
+	// given filter could support more than one file type,
+	// so we must query a match for all file types
+	for (UT_uint32 k=0; (k < NrElements(s_impTable)); k++)
+	{
+		struct _imp * s = &s_impTable[k];
+		if (s->fpRecognizeContents(szBuf, iNumbytes))
+		{
+			for (UT_uint32 a = 0; a < (UT_uint32) IEFT_LAST_BOGUS; a++)
+			{
+				if (s->fpSupportsFileType((IEFileType) a))
+					return (IEFileType) a;
+			}
+
+			UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+			// Hm... an importer recognizes the given data
+			// but refuses to support any file type we request.
+			return IEFT_Unknown;
+		}
+	}
+
+	// No filter recognizes this data
+	return IEFT_Unknown;
+
+}
+
 IEFileType IE_Imp::fileTypeForSuffix(const char * szSuffix)
 {
 	if (!szSuffix)
-		return IEFT_Text;
+		return IEFT_Unknown;
 
 	// we have to construct the loop this way because a
 	// given filter could support more than one file type,
@@ -96,14 +126,13 @@
 
 			UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
 			// Hm... an importer has registered for the given suffix,
-			// bug refuses to support any file type we request.
-			// Default to Text.
-			return IEFT_Text;
+			// but refuses to support any file type we request.
+			return IEFT_Unknown;
 		}
 	}
 
-	// No filter is registered for that extension, try Text for import
-	return IEFT_Text;
+	// No filter is registered for that extension
+	return IEFT_Unknown;
 }
 
 
@@ -120,12 +149,30 @@
 	UT_ASSERT(szFilename && *szFilename);
 	UT_ASSERT(ppie);
 
-	// no filter will support IEFT_Unknown, so we detect from the
-	// suffix of the filename, the real importer to use and assign
-	// that back to ieft.
+	// no filter will support IEFT_Unknown, so we try to detect
+	// from the contents of the file or the filename suffix
+	// the importer to use and assign that back to ieft.
+	// Give precedence to the file suffix.
 	if (ieft == IEFT_Unknown)
 	{
 		ieft = IE_Imp::fileTypeForSuffix(UT_pathSuffix(szFilename));
+	}
+	if (ieft == IEFT_Unknown)
+	{
+		char szBuf[4096]; // must cover the deepest probe (MsWord magic at offset 2112)
+		int iNumbytes;
+		FILE *f;
+		if ( ( f = fopen( szFilename, "rb" ) ) != (FILE *)0 ) // binary mode: the magic bytes must arrive untranslated
+		{
+			iNumbytes = (int) fread(szBuf, 1, sizeof(szBuf), f);
+			fclose(f);
+			ieft = IE_Imp::fileTypeForContents(szBuf, iNumbytes);
+		}
+	}
+	// as a last resort, just try importing it as text :(
+	if (ieft == IEFT_Unknown)
+	{
+		ieft = IEFT_Text ;
 	}
 
 	UT_ASSERT(ieft != IEFT_Unknown);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp.h abi/src/wp/impexp/xp/ie_imp.h
--- abi/src/wp/impexp/xp.orig/ie_imp.h	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp.h	Mon Feb 7 10:33:52 2000
@@ -38,6 +38,9 @@
 	// responsible for destroying the importer when finished
 	// with it.
 
+	static IEFileType fileTypeForContents(const char * szBuf,
+			int iNumbytes);
+
 	static IEFileType fileTypeForSuffix(const char * szSuffix);
 
 	static UT_Error constructImporter(PD_Document * pDocument,
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.cpp abi/src/wp/impexp/xp/ie_imp_AbiWord_1.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.cpp	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_AbiWord_1.cpp	Mon Feb 7 11:15:21 2000
@@ -20,6 +20,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include "ut_types.h"
 #include "ut_assert.h"
 #include "ut_debugmsg.h"
@@ -144,6 +145,40 @@
 
 /*****************************************************************/
 /*****************************************************************/
+
+UT_Bool IE_Imp_AbiWord_1::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	int iLinesToRead = 6 ; // Only examine the first few lines of the file
+	int iBytesScanned = 0 ;
+	const char *p ;
+	const char *magic ;
+	p = szBuf ;
+	while( iLinesToRead-- )
+	{
+		magic = "<abiword " ;
+		if ( (iNumbytes - iBytesScanned) < (int) strlen(magic) ) return(UT_FALSE);
+		if ( strncmp(p, magic, strlen(magic)) == 0 ) return(UT_TRUE);
+		magic = "<!-- This file is an AbiWord document." ;
+		if ( (iNumbytes - iBytesScanned) < (int) strlen(magic) ) return(UT_FALSE);
+		if ( strncmp(p, magic, strlen(magic)) == 0 ) return(UT_TRUE);
+		/* Seek to the next newline: */
+		while ( *p != '\n' && *p != '\r' )
+		{
+			iBytesScanned++ ; p++ ;
+			if( iBytesScanned+2 >= iNumbytes ) return(UT_FALSE);
+		}
+		/* Seek past the next newline: */
+		if ( *p == '\n' || *p == '\r' )
+		{
+			iBytesScanned++ ; p++ ;
+			if ( *p == '\n' || *p == '\r' )
+			{
+				iBytesScanned++ ; p++ ;
+			}
+		}
+	}
+	return(UT_FALSE);
+}
 
 UT_Bool IE_Imp_AbiWord_1::RecognizeSuffix(const char * szSuffix)
 {
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.h abi/src/wp/impexp/xp/ie_imp_AbiWord_1.h
--- abi/src/wp/impexp/xp.orig/ie_imp_AbiWord_1.h	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_AbiWord_1.h	Mon Feb 7 10:33:52 2000
@@ -48,6 +48,7 @@
 	void _endElement(const XML_Char *name);
 	void _charData(const XML_Char*, int);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.cpp abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.cpp	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.cpp	Mon Feb 7 10:43:03 2000
@@ -54,6 +54,23 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_GZipAbiWord::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	// TODO: This is a hack. Since we're just passed
+	// TODO: some data, and not the actual filename, there isn't
+	// TODO: much we can do other than verify that it is gzip'ed
+	// TODO: data. For the time being, assume that if it is
+	// TODO: gzip'ed, it's gzip'ed abiword. This assumption will
+	// TODO: be false if and when we support any other compressed
+	// TODO: formats.
+	if ( iNumbytes < 2 ) return(UT_FALSE);
+	if ( ( szBuf[0] == (char)0x1f ) && ( szBuf[1] == (char)0x8b ) )
+	{
+		return(UT_TRUE);
+	}
+	return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_GZipAbiWord::RecognizeSuffix(const char * szSuffix)
 {
 	return (UT_stricmp(szSuffix,".zabw") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.h abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.h
--- abi/src/wp/impexp/xp.orig/ie_imp_GZipAbiWord.h	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_GZipAbiWord.h	Mon Feb 7 10:33:52 2000
@@ -36,6 +36,7 @@
 	virtual void pasteFromBuffer(PD_DocumentRange * pDocRange,
 			unsigned char * pData, UT_uint32 lenData);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.cpp abi/src/wp/impexp/xp/ie_imp_MsWord_97.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.cpp	Sat Feb 5 19:49:37 2000
+++ abi/src/wp/impexp/xp/ie_imp_MsWord_97.cpp	Mon Feb 7 11:02:03 2000
@@ -681,6 +681,82 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_MsWord_97::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	// TODO: This is rather crude, because we don't parse OLE files.
+	// TODO: For the time being, we assume that any OLE file is an
+	// TODO: msword document.
+	// TODO: Caolan is gonna kill me for this. :)
+	// Most of the magic numbers here were taken from the public domain
+	// /etc/magic file distributed with the file(1) command written
+	// by Ian F. Darwin, with contributions and magic entries from
+	// Rob McMahon, Guy Harris, Christos Zoulas,
+	// Mark Moraes, and Pawel Wiecek.
+	const char *magic ;
+	int magicoffset ;
+	magic = "Microsoft Word 6.0 Document" ;
+	magicoffset = 2080 ;
+	if ( iNumbytes > magicoffset+(int)strlen(magic) )
+	{
+		if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+		{
+			return(UT_TRUE);
+		}
+	}
+	magic = "Documento Microsoft Word 6" ;
+	magicoffset = 2080 ;
+	if ( iNumbytes > magicoffset+(int)strlen(magic) )
+	{
+		if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+		{
+			return(UT_TRUE);
+		}
+	}
+	magic = "MSWordDoc" ;
+	magicoffset = 2112 ;
+	if ( iNumbytes > magicoffset+(int)strlen(magic) )
+	{
+		if ( strncmp(szBuf+magicoffset, magic, strlen(magic)) == 0 )
+		{
+			return(UT_TRUE);
+		}
+	}
+	if ( iNumbytes > 8 )
+	{
+		if ( szBuf[0] == (char)0x31 && szBuf[1] == (char)0xbe &&
+			 szBuf[2] == (char)0 && szBuf[3] == (char)0 )
+		{
+			return(UT_TRUE);
+		}
+		if ( szBuf[0] == 'P' && szBuf[1] == 'O' &&
+			 szBuf[2] == '^' && szBuf[3] == 'Q' && szBuf[4] == '`' )
+		{
+			return(UT_TRUE);
+		}
+		if ( szBuf[0] == (char)0xfe && szBuf[1] == (char)0x37 &&
+			 szBuf[2] == (char)0 && szBuf[3] == (char)0x23 )
+		{
+			return(UT_TRUE);
+		}
+		// OLE magic:
+		// TODO: Dig through the OLE file
+		if ( szBuf[0] == (char)0xd0 && szBuf[1] == (char)0xcf &&
+			 szBuf[2] == (char)0x11 && szBuf[3] == (char)0xe0 &&
+			 szBuf[4] == (char)0xa1 && szBuf[5] == (char)0xb1 &&
+			 szBuf[6] == (char)0x1a && szBuf[7] == (char)0xe1 )
+		{
+			return(UT_TRUE);
+		}
+		if ( szBuf[0] == (char)0xdb && szBuf[1] == (char)0xa5 &&
+			 szBuf[2] == (char)0x2d && szBuf[3] == (char)0 &&
+			 szBuf[4] == (char)0 && szBuf[5] == (char)0 )
+		{
+			return(UT_TRUE);
+		}
+	}
+	return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_MsWord_97::RecognizeSuffix(const char * szSuffix)
 {
 	return (UT_stricmp(szSuffix,".doc") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.h abi/src/wp/impexp/xp/ie_imp_MsWord_97.h
--- abi/src/wp/impexp/xp.orig/ie_imp_MsWord_97.h	Sat Feb 5 19:49:37 2000
+++ abi/src/wp/impexp/xp/ie_imp_MsWord_97.h	Mon Feb 7 10:33:52 2000
@@ -43,6 +43,7 @@
 	virtual void pasteFromBuffer(PD_DocumentRange * pDocRange,
 			unsigned char * pData, UT_uint32 lenData);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_RTF.cpp abi/src/wp/impexp/xp/ie_imp_RTF.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_RTF.cpp	Fri Feb 4 14:47:07 2000
+++ abi/src/wp/impexp/xp/ie_imp_RTF.cpp	Mon Feb 7 10:33:52 2000
@@ -319,6 +319,19 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_RTF::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	if ( iNumbytes < 5 )
+	{
+		return(UT_FALSE);
+	}
+	if ( strncmp( szBuf, "{\\rtf", 5 ) == 0 )
+	{
+		return(UT_TRUE) ;
+	}
+	return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_RTF::RecognizeSuffix(const char * szSuffix)
 {
 	return (UT_stricmp(szSuffix,".rtf") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_RTF.h abi/src/wp/impexp/xp/ie_imp_RTF.h
--- abi/src/wp/impexp/xp.orig/ie_imp_RTF.h	Tue Feb 1 11:41:09 2000
+++ abi/src/wp/impexp/xp/ie_imp_RTF.h	Mon Feb 7 10:33:52 2000
@@ -175,6 +175,7 @@
 	virtual void pasteFromBuffer(PD_DocumentRange * pDocRange,
 			unsigned char * pData, UT_uint32 lenData);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_Text.cpp abi/src/wp/impexp/xp/ie_imp_Text.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_Text.cpp	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_Text.cpp	Mon Feb 7 11:03:49 2000
@@ -237,6 +237,14 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_Text::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	// TODO: We give the other guys a chance, since this
+	// TODO: importer is so generic. Does this seem
+	// TODO: like a sensible strategy?
+	return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_Text::RecognizeSuffix(const char * szSuffix)
 {
 	return (UT_stricmp(szSuffix,".txt") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_Text.h abi/src/wp/impexp/xp/ie_imp_Text.h
--- abi/src/wp/impexp/xp.orig/ie_imp_Text.h	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_Text.h	Mon Feb 7 10:33:52 2000
@@ -37,6 +37,7 @@
 	virtual void pasteFromBuffer(PD_DocumentRange * pDocRange,
 			unsigned char * pData, UT_uint32 lenData);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_UTF8.cpp abi/src/wp/impexp/xp/ie_imp_UTF8.cpp
--- abi/src/wp/impexp/xp.orig/ie_imp_UTF8.cpp	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_UTF8.cpp	Mon Feb 7 11:04:05 2000
@@ -302,6 +302,12 @@
 /*****************************************************************/
 /*****************************************************************/
 
+UT_Bool IE_Imp_UTF8::RecognizeContents(const char * szBuf, int iNumbytes)
+{
+	// TODO: Not yet written
+	return(UT_FALSE);
+}
+
 UT_Bool IE_Imp_UTF8::RecognizeSuffix(const char * szSuffix)
 {
 	return (UT_stricmp(szSuffix,".utf8") == 0);
diff -u -d -r -P abi/src/wp/impexp/xp.orig/ie_imp_UTF8.h abi/src/wp/impexp/xp/ie_imp_UTF8.h
--- abi/src/wp/impexp/xp.orig/ie_imp_UTF8.h	Wed Jan 26 16:22:45 2000
+++ abi/src/wp/impexp/xp/ie_imp_UTF8.h	Mon Feb 7 10:33:52 2000
@@ -37,6 +37,7 @@
 	virtual void pasteFromBuffer(PD_DocumentRange * pDocRange,
 			unsigned char * pData, UT_uint32 lenData);
 
+	static UT_Bool RecognizeContents(const char * szBuf, int iNumbytes);
 	static UT_Bool RecognizeSuffix(const char * szSuffix);
 	static UT_Error StaticConstructor(PD_Document * pDocument,
 			IE_Imp ** ppie);