This article applies to the use of the FAST Enterprise Crawler.
If you use the FAST Enterprise Crawler's document type detection, you may experience that documents that are in the Office 2010 and newer formats are detected as application/zip. One way to include these documents in the crawl is to allow this format as a supported document type. Here are some useful tricks on how to get the most out of the Office document types, without including all generic zip files.
import
pyuriparse
from
pylib
import
Logger
import
re
# Module-level debug/logging switches, shared (via `global`) by all
# CrawlerFilterZip instances in this process.
LOG_ONCE = 1  # Set to 1 to get the init pattern.
DEBUG_10 = 1  # Set to some value > 0 to get a limited number of msgs. E.g. 10 :)
class CrawlerFilterZip:
    """
    Reject documents that are application/zip, but don't have an office extension.
    """

    def __init__(self):
        global LOG_ONCE
        # Case-insensitive match of an Office/OpenDocument extension at the
        # end of the URI path (params removed before matching in process()).
        # Raw string so the regex backslash survives source-level escaping.
        self.RE = r"(?i)\.((ppt|doc|xls)[x]?|ods|odp|odt)$"
        self.officeExtensionRE = re.compile(self.RE)
        # Log the configured pattern only once per process, not per instance.
        if LOG_ONCE > 0:
            log(log.FLOG_INFO,
                'CrawlerFilterZip.__init__() - URI path (params removed) must match "%s"'
                % self.RE)
            LOG_ONCE = 0

    ##################################################################
    # CrawlerFilterZip.process(doc) - Filter (remove) zip files that are not office docs.
    # Crawler plugin, processing documents scheduled for link extraction.
    # Simple attempt at filtering documents that are zip files, but not office docs.
    # Using file extension as method to filter out non-office docs (!)
    # Looking at the URI path after parameter removal.
    #-----------------------------------------------------------------
    # Expect:
    #  doc.mimetype       - e.g. application/zip
    #  doc.data           - the downloaded document - used to check for zip signature
    #  doc.store_document - indicate if the document is to be stored or not
    #                       For non-office docs - want to set this to 0
    # NOTE: Don't call this if already known that store_doc should be 0 (e.g. other plugins)
    ##################################################################
    def process(self, doc):
        """
        Decide whether a downloaded document should be stored.

        Sets doc.store_document to 0 for zip-format documents whose URI path
        lacks an office extension, 1 otherwise; sets doc.errmsg on rejection.
        Returns nothing - the result is communicated via the doc attributes.
        """
        global DEBUG_10
        doc.store_document = 1
        try:
            # Only check documents returned as ZIP from server, or if have ZIP signature:
            if doc.mimetype == 'application/zip' or doc.data[0:4] == 'PK\003\004':
                # Assume rejection; flip back to 1 only on an extension match.
                doc.store_document = 0
                uriParts = pyuriparse.uriparse(doc.uri)
                matches = self.officeExtensionRE.findall(uriParts[pyuriparse.URI_PATH])
                if len(matches) > 0:
                    log(log.FLOG_INFO,
                        'CrawlerFilterZip.process() - Passed "%s, %s, %s"'
                        % (doc.uri, doc.mimetype, repr(doc.data[0:4])))
                    doc.store_document = 1
                else:
                    log(log.FLOG_WARNING,
                        'CrawlerFilterZip.process() - Rejected "%s, %s, %s"'
                        % (doc.uri, doc.mimetype, repr(doc.data[0:4])))
                    doc.errmsg = "CrawlerFilterZip: %s failed OFFICE extension check" \
                        % uriParts[pyuriparse.URI_PATH]
            else:
                # Not checking non-zip documents.
                if DEBUG_10 > 0:
                    log(log.FLOG_INFO,
                        'CrawlerFilterZip.process() - Not checking "%s, %s, %s"'
                        % (doc.uri, doc.mimetype, repr(doc.data[0:4])))
                    DEBUG_10 -= 1
        except:
            # Deliberate best-effort boundary for a crawler plugin: never let
            # a filtering error kill the crawl.  (Bare except kept from the
            # original plugin style.)
            try:
                log_exc(log.FLOG_ERROR,
                        'CrawlerFilterZip.process() - Unable to check %s'
                        % doc.uri)
                doc.store_document = 0
                # BUGFIX: original read self.uri, an attribute this class never
                # sets, so the handler itself raised AttributeError.
                doc.errmsg = "CrawlerFilterZip: %s caught exception." % doc.uri
            except:
                # Even doc.uri may be unusable on a malformed document;
                # log what we can and pass the document through untouched.
                log_exc(log.FLOG_ERROR,
                        'CrawlerFilterZip.process() - Passing weird document')
                pass
The document will then still be passed on as application/zip to the FAST ESP Pipeline, but the standard format detector there will identify it as a Word/PowerPoint/Excel document and handle it properly.