diff --git a/tests/memory_benchmark.py b/tests/memory_benchmark.py index db83a99..d3d2b9e 100644 --- a/tests/memory_benchmark.py +++ b/tests/memory_benchmark.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -17,13 +16,14 @@ # # To run: # python tika/tests/memory_benchmark.py +import gzip import os import zlib -import gzip -import tika.parser from memory_profiler import profile +import tika.parser + @profile def test_parser_binary(): diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 88ddc5f..8994c5f 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -16,11 +15,10 @@ # limitations under the License. # # pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py -# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py +import gzip import os import unittest import zlib -import gzip from http import HTTPStatus import tika.parser diff --git a/tests/test_from_file_service.py b/tests/test_from_file_service.py index 7e73228..9831ff7 100644 --- a/tests/test_from_file_service.py +++ b/tests/test_from_file_service.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -17,12 +16,9 @@ # # python -m unittest tika.tests.test_from_file_service -import sys import unittest -if sys.version_info >= (3, 3): - from unittest import mock -else: - import mock +from unittest import mock + import tika.parser diff --git a/tests/test_tika.py b/tests/test_tika.py index c61cb81..707fbee 100644 --- a/tests/test_tika.py +++ b/tests/test_tika.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tests/tests_params.py b/tests/tests_params.py index b9d25ae..41ed073 100644 --- a/tests/tests_params.py +++ b/tests/tests_params.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -19,14 +19,16 @@ #https://docs.python.org/2/library/unittest.html #http://eli.thegreenplace.net/2011/08/02/python-unit-testing-parametrized-test-cases #public domain license reference: http://eli.thegreenplace.net/pages/code - + #Run #python tika/tests/tests_params.py import csv import unittest + import tika.parser + class CreateTest(unittest.TestCase): "test for file types" def __init__(self, methodName='runTest', param1=None, param2=None): @@ -64,17 +66,17 @@ def test_suite(): try: suite.addTest(CreateTest.parameterize(RemoteTest,param1=x)) except IOError as e: - print(e.strerror) - return suite + print(e.strerror) + return suite def test_url(): with open('tika/tests/arguments/test_remote_content.csv', 'r') as csvfile: urlread = csv.reader(csvfile) for url in urlread: yield url[1] - + if __name__ == '__main__': suite = test_suite() - unittest.TextTestRunner(verbosity=2).run(suite) \ No newline at end of file + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/tests/tests_unpack.py b/tests/tests_unpack.py index 38a6b41..1c8e37b 100644 --- a/tests/tests_unpack.py +++ b/tests/tests_unpack.py @@ -1,7 +1,6 @@ -# coding=utf8 - import unittest from tempfile import NamedTemporaryFile + from tika import unpack diff --git a/tika/__init__.py b/tika/__init__.py index 4b0ad6f..39cbe7f 100644 --- a/tika/__init__.py +++ b/tika/__init__.py @@ -1,4 +1,3 @@ -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -17,6 +16,7 @@ __version__ = "3.1.0" from pkgutil import extend_path + __path__ = extend_path(__path__, __name__) def initVM(): diff --git a/tika/config.py b/tika/config.py index 553f784..3eabf10 100644 --- a/tika/config.py +++ b/tika/config.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,10 +14,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# from .tika import getConfig + def getParsers(): return getConfig('parsers')[1] diff --git a/tika/detector.py b/tika/detector.py index a844cc7..0d78133 100644 --- a/tika/detector.py +++ b/tika/detector.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# + +from .tika import ServerEndpoint, callServer, detectType1 -from .tika import detectType1, callServer, ServerEndpoint def from_file(filename, config_path=None, requestOptions={}): ''' diff --git a/tika/language.py b/tika/language.py index 2da3ce9..0280111 100644 --- a/tika/language.py +++ b/tika/language.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,9 +14,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# + +from .tika import ServerEndpoint, callServer, detectLang1 -from .tika import detectLang1, callServer, ServerEndpoint def from_file(filename, requestOptions={}): ''' diff --git a/tika/parser.py b/tika/parser.py index a648ebc..231f28a 100644 --- a/tika/parser.py +++ b/tika/parser.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -16,9 +16,11 @@ # limitations under the License. # -from .tika import parse1, callServer, ServerEndpoint import json +from .tika import ServerEndpoint, callServer, parse1 + + def from_file(filename, serverEndpoint=ServerEndpoint, service='all', xmlContent=False, headers=None, config_path=None, requestOptions={}, raw_response=False): ''' Parses a file for metadata and content diff --git a/tika/pdf.py b/tika/pdf.py index b5daec5..7a99e2e 100644 --- a/tika/pdf.py +++ b/tika/pdf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -16,10 +16,13 @@ # limitations under the License. # -from tika import parser from io import StringIO + from bs4 import BeautifulSoup +from tika import parser + + def text_from_pdf_pages(filename): pages_txt = [] diff --git a/tika/tika.py b/tika/tika.py index 594301b..cf2f33f 100755 --- a/tika/tika.py +++ b/tika/tika.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -88,7 +88,7 @@ Arguments: urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika - + Switches: --verbose, -v = verbose mode --encode, -e = encode response in UTF-8 @@ -104,18 +104,14 @@ """ -import sys, os, getopt, time, codecs, re +import codecs +import getopt +import os +import re +import sys +import time from pathlib import Path -try: - unicode_string = unicode - binary_string = str -except NameError: - unicode_string = str - binary_string = bytes -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse as urlparse +from urllib.parse import urlparse as urlparse try: from rfc6266 import build_header @@ -125,21 +121,18 @@ def make_content_disposition_header(fn): def make_content_disposition_header(fn): return 'attachment; filename=%s' % os.path.basename(fn) -if sys.version_info[0] < 3: - open = codecs.open - -import requests -import socket -import tempfile +import ctypes import hashlib +import io +import logging import platform -from subprocess import Popen -from subprocess import STDOUT -from os import walk import signal -import logging -import io -import ctypes +import socket +import tempfile +from os import walk +from subprocess import STDOUT, Popen + +import requests log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir()) log_file = os.path.join(log_path, os.getenv('TIKA_LOG_FILE', 'tika.log')) @@ -194,7 +187,7 @@ def make_content_disposition_header(fn): class TikaException(Exception): pass -def echo2(*s): sys.stderr.write(unicode_string('tika.py: %s\n') % unicode_string(' ').join(map(unicode_string, s))) +def echo2(*s): sys.stderr.write(str('tika.py: %s\n') % str(' ').join(map(str, s))) def warn(*s): echo2('Warn:', *s) def die(*s): warn('Error:', *s); echo2(USAGE); sys.exit() @@ -226,7 +219,7 @@ def runCommand(cmd, option, urlOrPaths, port, outDir=None, elif cmd == "language": return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) elif cmd == "translate": - return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) + return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) elif cmd == "config": status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar) return resp @@ -243,7 +236,7 @@ def getPaths(urlOrPaths): :param urlOrPaths: the url or path to be scanned :return: ``list`` of paths ''' - if isinstance(urlOrPaths, unicode_string): + if isinstance(urlOrPaths, str): urlOrPaths = [urlOrPaths] # do not recursively walk over letters of a single path which can include "/" paths = [] for eachUrlOrPaths in urlOrPaths: @@ -287,7 +280,7 @@ def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, return metaPaths -def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json', services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False): ''' @@ -323,13 +316,13 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti headers = headers or {} path, file_type = getRemoteFile(urlOrPath, TikaFilesPath) - headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) + headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path)}) if option not in services: log.warning('config option must be one of meta, text, or all; using all.') service = services.get(option, services['all']) if service == '/tika': responseMimeType = 'text/plain' - headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) + headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path)}) with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f: status, response = callServer('put', serverEndpoint, service, f, headers, verbose, tikaServerJar, config_path=config_path, @@ -356,7 +349,7 @@ def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos return [detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] -def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'file' : '/language/stream'}, requestOptions={}): ''' @@ -372,14 +365,14 @@ def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos ''' path, mode = getRemoteFile(urlOrPath, TikaFilesPath) if option not in services: - log.exception('Language option must be one of %s ' % binary_string(services.keys())) - raise TikaException('Language option must be one of %s ' % binary_string(services.keys())) + log.exception('Language option must be one of %s ' % bytes(services.keys())) + raise TikaException('Language option must be one of %s ' % bytes(services.keys())) service = services[option] status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions) return (status, response) -def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'all': '/translate/all'}): ''' @@ -396,9 +389,9 @@ def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbo paths = getPaths(urlOrPaths) return [doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] - + def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, - responseMimeType='text/plain', + responseMimeType='text/plain', services={'all': '/translate/all'}, requestOptions={}): ''' @@ -414,7 +407,7 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo path, mode = getRemoteFile(urlOrPath, TikaFilesPath) srcLang = "" destLang = "" - + if ":" in option: options = option.rsplit(':') srcLang = options[0] @@ -424,17 +417,17 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo raise TikaException('Translate options are specified as srcLang:destLang or as destLang') else: destLang = option - + if srcLang != "" and destLang != "": service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang else: - service = services["all"] + "/" + Translator + "/" + destLang + service = services["all"] + "/" + Translator + "/" + destLang status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), {'Accept' : responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions) return (status, response) - -def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, + +def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'type': '/detect/stream'}): ''' @@ -452,7 +445,7 @@ def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos return [detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] -def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'type': '/detect/stream'}, config_path=None, requestOptions={}): ''' @@ -468,13 +461,13 @@ def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos ''' path, mode = getRemoteFile(urlOrPath, TikaFilesPath) if option not in services: - log.exception('Detect option must be one of %s' % binary_string(services.keys())) - raise TikaException('Detect option must be one of %s' % binary_string(services.keys())) + log.exception('Detect option must be one of %s' % bytes(services.keys())) + raise TikaException('Detect option must be one of %s' % bytes(services.keys())) service = services[option] status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), { 'Accept': responseMimeType, - 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path) + 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path) }, verbose, tikaServerJar, config_path=config_path, requestOptions=requestOptions) if csvOutput == 1: @@ -516,29 +509,29 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti :param classpath: :return: ''' - parsedUrl = urlparse(serverEndpoint) + parsedUrl = urlparse(serverEndpoint) serverHost = parsedUrl.hostname scheme = parsedUrl.scheme port = parsedUrl.port if classpath is None: classpath = TikaServerClasspath - + global TikaClientOnly if not TikaClientOnly: serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path) serviceUrl = serverEndpoint + service if verb not in httpVerbs: - log.exception('Tika Server call must be one of %s' % binary_string(httpVerbs.keys())) - raise TikaException('Tika Server call must be one of %s' % binary_string(httpVerbs.keys())) + log.exception('Tika Server call must be one of %s' % bytes(httpVerbs.keys())) + raise TikaException('Tika Server call must be one of %s' % bytes(httpVerbs.keys())) verbFn = httpVerbs[verb] if Windows and hasattr(data, "read"): data = data.read() - + encodedData = data - if type(data) is unicode_string: + if type(data) is str: encodedData = data.encode('utf-8') requestOptionsDefault = { @@ -673,7 +666,7 @@ def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, s # Patch for Windows support if Windows: if sys.version.startswith("2"): - # Python 2.x + # Python 2.x TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True) elif sys.version.startswith("3"): # Python 3.x @@ -709,7 +702,7 @@ def killServer(): try: os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM) except: - log.error("Failed to kill the current server session") + log.error("Failed to kill the current server session") time.sleep(1) # patch to support subprocess killing for windows if Windows: @@ -728,7 +721,7 @@ def killServer(): try: os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM) except: - log.error("Failed to kill the current server session") + log.error("Failed to kill the current server session") time.sleep(1) else: log.error("Server not running, or was already running before") @@ -846,7 +839,7 @@ def getRemoteJar(urlOrPath, destPath): log.info('Retrieving %s to %s.' % (urlOrPath, destPath)) _urlretrieve(urlOrPath, destPath) return (destPath, 'remote') - + def checkPortIsOpen(remoteServerHost=ServerHost, port = Port): ''' Checks if the specified port is open diff --git a/tika/translate.py b/tika/translate.py index c781fb7..beb57c7 100644 --- a/tika/translate.py +++ b/tika/translate.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -16,7 +15,8 @@ # limitations under the License. # -from .tika import doTranslate1, callServer, Translator, ServerEndpoint +from .tika import ServerEndpoint, Translator, callServer, doTranslate1 + def from_file(filename, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}): ''' diff --git a/tika/unpack.py b/tika/unpack.py index acc0b58..0cdcba0 100644 --- a/tika/unpack.py +++ b/tika/unpack.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -16,12 +15,13 @@ # limitations under the License. # -from .tika import parse1, callServer, ServerEndpoint +import csv import tarfile +from contextlib import closing from io import BytesIO, TextIOWrapper -import csv from sys import version_info -from contextlib import closing + +from .tika import ServerEndpoint, callServer, parse1 # Python 3 introduced .readable() to tarfile extracted files objects - this # is required to wrap a TextIOWrapper around the object. However, wrapping