From a3b7a04c2b4998c56eec18c650efe96b76079a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 9 Mar 2026 08:53:57 +0100 Subject: [PATCH 1/4] style: sort import statements using ruff --- tests/memory_benchmark.py | 5 +++-- tests/test_benchmark.py | 3 +-- tests/test_from_file_service.py | 1 + tests/tests_params.py | 2 ++ tests/tests_unpack.py | 1 + tika/__init__.py | 1 + tika/config.py | 1 + tika/detector.py | 3 ++- tika/language.py | 3 ++- tika/parser.py | 4 +++- tika/pdf.py | 5 ++++- tika/tika.py | 26 ++++++++++++++++---------- tika/translate.py | 3 ++- tika/unpack.py | 7 ++++--- 14 files changed, 43 insertions(+), 22 deletions(-) diff --git a/tests/memory_benchmark.py b/tests/memory_benchmark.py index db83a994..77ac414a 100644 --- a/tests/memory_benchmark.py +++ b/tests/memory_benchmark.py @@ -17,13 +17,14 @@ # # To run: # python tika/tests/memory_benchmark.py +import gzip import os import zlib -import gzip -import tika.parser from memory_profiler import profile +import tika.parser + @profile def test_parser_binary(): diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 88ddc5fd..12a7441b 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -16,11 +16,10 @@ # limitations under the License. # # pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py -# pytest --benchmark-enable --benchmark-timer=time.process_time tika/tests/test_benchmark.py +import gzip import os import unittest import zlib -import gzip from http import HTTPStatus import tika.parser diff --git a/tests/test_from_file_service.py b/tests/test_from_file_service.py index 7e732289..3b982a74 100644 --- a/tests/test_from_file_service.py +++ b/tests/test_from_file_service.py @@ -19,6 +19,7 @@ import sys import unittest + if sys.version_info >= (3, 3): from unittest import mock else: diff --git a/tests/tests_params.py b/tests/tests_params.py index b9d25ae8..b9e4afeb 100644 --- a/tests/tests_params.py +++ b/tests/tests_params.py @@ -25,8 +25,10 @@ import csv import unittest + import tika.parser + class CreateTest(unittest.TestCase): "test for file types" def __init__(self, methodName='runTest', param1=None, param2=None): diff --git a/tests/tests_unpack.py b/tests/tests_unpack.py index 38a6b415..ade748e4 100644 --- a/tests/tests_unpack.py +++ b/tests/tests_unpack.py @@ -2,6 +2,7 @@ import unittest from tempfile import NamedTemporaryFile + from tika import unpack diff --git a/tika/__init__.py b/tika/__init__.py index 4b0ad6fb..b77ebdd7 100644 --- a/tika/__init__.py +++ b/tika/__init__.py @@ -17,6 +17,7 @@ __version__ = "3.1.0" from pkgutil import extend_path + __path__ = extend_path(__path__, __name__) def initVM(): diff --git a/tika/config.py b/tika/config.py index 553f7845..d187b7f1 100644 --- a/tika/config.py +++ b/tika/config.py @@ -18,6 +18,7 @@ from .tika import getConfig + def getParsers(): return getConfig('parsers')[1] diff --git a/tika/detector.py b/tika/detector.py index a844cc7a..d86335a3 100644 --- a/tika/detector.py +++ b/tika/detector.py @@ -16,7 +16,8 @@ # limitations under the License. # -from .tika import detectType1, callServer, ServerEndpoint +from .tika import ServerEndpoint, callServer, detectType1 + def from_file(filename, config_path=None, requestOptions={}): ''' diff --git a/tika/language.py b/tika/language.py index 2da3ce9e..e7106293 100644 --- a/tika/language.py +++ b/tika/language.py @@ -16,7 +16,8 @@ # limitations under the License. # -from .tika import detectLang1, callServer, ServerEndpoint +from .tika import ServerEndpoint, callServer, detectLang1 + def from_file(filename, requestOptions={}): ''' diff --git a/tika/parser.py b/tika/parser.py index a648ebca..1632f2eb 100644 --- a/tika/parser.py +++ b/tika/parser.py @@ -16,9 +16,11 @@ # limitations under the License. # -from .tika import parse1, callServer, ServerEndpoint import json +from .tika import ServerEndpoint, callServer, parse1 + + def from_file(filename, serverEndpoint=ServerEndpoint, service='all', xmlContent=False, headers=None, config_path=None, requestOptions={}, raw_response=False): ''' Parses a file for metadata and content diff --git a/tika/pdf.py b/tika/pdf.py index b5daec50..e0efefc4 100644 --- a/tika/pdf.py +++ b/tika/pdf.py @@ -16,10 +16,13 @@ # limitations under the License. # -from tika import parser from io import StringIO + from bs4 import BeautifulSoup +from tika import parser + + def text_from_pdf_pages(filename): pages_txt = [] diff --git a/tika/tika.py b/tika/tika.py index 594301bf..9122b981 100755 --- a/tika/tika.py +++ b/tika/tika.py @@ -104,8 +104,14 @@ """ -import sys, os, getopt, time, codecs, re +import codecs +import getopt +import os +import re +import sys +import time from pathlib import Path + try: unicode_string = unicode binary_string = str @@ -128,18 +134,18 @@ def make_content_disposition_header(fn): if sys.version_info[0] < 3: open = codecs.open -import requests -import socket -import tempfile +import ctypes import hashlib +import io +import logging import platform -from subprocess import Popen -from subprocess import STDOUT -from os import walk import signal -import logging -import io -import ctypes +import socket +import tempfile +from os import walk +from subprocess import STDOUT, Popen + +import requests log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir()) log_file = os.path.join(log_path, os.getenv('TIKA_LOG_FILE', 'tika.log')) diff --git a/tika/translate.py b/tika/translate.py index c781fb73..ea95548d 100644 --- a/tika/translate.py +++ b/tika/translate.py @@ -16,7 +16,8 @@ # limitations under the License. # -from .tika import doTranslate1, callServer, Translator, ServerEndpoint +from .tika import ServerEndpoint, Translator, callServer, doTranslate1 + def from_file(filename, srcLang, destLang, serverEndpoint=ServerEndpoint, requestOptions={}): ''' diff --git a/tika/unpack.py b/tika/unpack.py index acc0b58c..bd0bf632 100644 --- a/tika/unpack.py +++ b/tika/unpack.py @@ -16,12 +16,13 @@ # limitations under the License. # -from .tika import parse1, callServer, ServerEndpoint +import csv import tarfile +from contextlib import closing from io import BytesIO, TextIOWrapper -import csv from sys import version_info -from contextlib import closing + +from .tika import ServerEndpoint, callServer, parse1 # Python 3 introduced .readable() to tarfile extracted files objects - this # is required to wrap a TextIOWrapper around the object. However, wrapping From 170d5320519226b09b1622e8c57ba7a1b2b959a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 9 Mar 2026 08:55:50 +0100 Subject: [PATCH 2/4] remove sys.version_info --- tests/test_from_file_service.py | 6 +----- tika/tika.py | 3 --- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/test_from_file_service.py b/tests/test_from_file_service.py index 3b982a74..4c83b9b6 100644 --- a/tests/test_from_file_service.py +++ b/tests/test_from_file_service.py @@ -17,13 +17,9 @@ # # python -m unittest tika.tests.test_from_file_service -import sys import unittest +from unittest import mock -if sys.version_info >= (3, 3): - from unittest import mock -else: - import mock import tika.parser diff --git a/tika/tika.py b/tika/tika.py index 9122b981..f3b314b3 100755 --- a/tika/tika.py +++ b/tika/tika.py @@ -131,9 +131,6 @@ def make_content_disposition_header(fn): def make_content_disposition_header(fn): return 'attachment; filename=%s' % os.path.basename(fn) -if sys.version_info[0] < 3: - open = codecs.open - import ctypes import hashlib import io From 5265371c0c8da3bb91fe72a242f3cf867d33a5e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 9 Mar 2026 08:58:48 +0100 Subject: [PATCH 3/4] style: remove obsolete source encoding --- tests/memory_benchmark.py | 1 - tests/test_benchmark.py | 1 - tests/test_from_file_service.py | 1 - tests/test_tika.py | 1 - tests/tests_params.py | 12 ++++----- tests/tests_unpack.py | 2 -- tika/__init__.py | 1 - tika/config.py | 4 +-- tika/detector.py | 3 +-- tika/language.py | 4 +-- tika/parser.py | 2 +- tika/pdf.py | 2 +- tika/tika.py | 44 ++++++++++++++++----------------- tika/translate.py | 1 - tika/unpack.py | 1 - 15 files changed, 35 insertions(+), 45 deletions(-) diff --git a/tests/memory_benchmark.py b/tests/memory_benchmark.py index 77ac414a..d3d2b9e2 100644 --- a/tests/memory_benchmark.py +++ b/tests/memory_benchmark.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 12a7441b..8994c5fd 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tests/test_from_file_service.py b/tests/test_from_file_service.py index 4c83b9b6..9831ff78 100644 --- a/tests/test_from_file_service.py +++ b/tests/test_from_file_service.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tests/test_tika.py b/tests/test_tika.py index c61cb812..707fbee1 100644 --- a/tests/test_tika.py +++ b/tests/test_tika.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tests/tests_params.py b/tests/tests_params.py index b9e4afeb..41ed0738 100644 --- a/tests/tests_params.py +++ b/tests/tests_params.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -19,7 +19,7 @@ #https://docs.python.org/2/library/unittest.html #http://eli.thegreenplace.net/2011/08/02/python-unit-testing-parametrized-test-cases #public domain license reference: http://eli.thegreenplace.net/pages/code - + #Run #python tika/tests/tests_params.py @@ -66,17 +66,17 @@ def test_suite(): try: suite.addTest(CreateTest.parameterize(RemoteTest,param1=x)) except IOError as e: - print(e.strerror) - return suite + print(e.strerror) + return suite def test_url(): with open('tika/tests/arguments/test_remote_content.csv', 'r') as csvfile: urlread = csv.reader(csvfile) for url in urlread: yield url[1] - + if __name__ == '__main__': suite = test_suite() - unittest.TextTestRunner(verbosity=2).run(suite) \ No newline at end of file + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/tests/tests_unpack.py b/tests/tests_unpack.py index ade748e4..1c8e37bb 100644 --- a/tests/tests_unpack.py +++ b/tests/tests_unpack.py @@ -1,5 +1,3 @@ -# coding=utf8 - import unittest from tempfile import NamedTemporaryFile diff --git a/tika/__init__.py b/tika/__init__.py index b77ebdd7..39cbe7f5 100644 --- a/tika/__init__.py +++ b/tika/__init__.py @@ -1,4 +1,3 @@ -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tika/config.py b/tika/config.py index d187b7f1..3eabf107 100644 --- a/tika/config.py +++ b/tika/config.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# from .tika import getConfig diff --git a/tika/detector.py b/tika/detector.py index d86335a3..0d78133d 100644 --- a/tika/detector.py +++ b/tika/detector.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# from .tika import ServerEndpoint, callServer, detectType1 diff --git a/tika/language.py b/tika/language.py index e7106293..02801119 100644 --- a/tika/language.py +++ b/tika/language.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +# from .tika import ServerEndpoint, callServer, detectLang1 diff --git a/tika/parser.py b/tika/parser.py index 1632f2eb..231f28ad 100644 --- a/tika/parser.py +++ b/tika/parser.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tika/pdf.py b/tika/pdf.py index e0efefc4..7a99e2e7 100644 --- a/tika/pdf.py +++ b/tika/pdf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tika/tika.py b/tika/tika.py index f3b314b3..66fe6567 100755 --- a/tika/tika.py +++ b/tika/tika.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# encoding: utf-8 + # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -88,7 +88,7 @@ Arguments: urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika - + Switches: --verbose, -v = verbose mode --encode, -e = encode response in UTF-8 @@ -113,7 +113,7 @@ from pathlib import Path try: - unicode_string = unicode + unicode_string = unicode binary_string = str except NameError: unicode_string = str @@ -229,7 +229,7 @@ def runCommand(cmd, option, urlOrPaths, port, outDir=None, elif cmd == "language": return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) elif cmd == "translate": - return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) + return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar) elif cmd == "config": status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar) return resp @@ -290,7 +290,7 @@ def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, return metaPaths -def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json', services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False): ''' @@ -359,7 +359,7 @@ def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos return [detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] -def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'file' : '/language/stream'}, requestOptions={}): ''' @@ -382,7 +382,7 @@ def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions) return (status, response) -def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'all': '/translate/all'}): ''' @@ -399,9 +399,9 @@ def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbo paths = getPaths(urlOrPaths) return [doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] - + def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, - responseMimeType='text/plain', + responseMimeType='text/plain', services={'all': '/translate/all'}, requestOptions={}): ''' @@ -417,7 +417,7 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo path, mode = getRemoteFile(urlOrPath, TikaFilesPath) srcLang = "" destLang = "" - + if ":" in option: options = option.rsplit(':') srcLang = options[0] @@ -427,17 +427,17 @@ def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbo raise TikaException('Translate options are specified as srcLang:destLang or as destLang') else: destLang = option - + if srcLang != "" and destLang != "": service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang else: - service = services["all"] + "/" + Translator + "/" + destLang + service = services["all"] + "/" + Translator + "/" + destLang status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), {'Accept' : responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions) return (status, response) - -def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, + +def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'type': '/detect/stream'}): ''' @@ -455,7 +455,7 @@ def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbos return [detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services) for path in paths] -def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, +def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='text/plain', services={'type': '/detect/stream'}, config_path=None, requestOptions={}): ''' @@ -519,14 +519,14 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti :param classpath: :return: ''' - parsedUrl = urlparse(serverEndpoint) + parsedUrl = urlparse(serverEndpoint) serverHost = parsedUrl.hostname scheme = parsedUrl.scheme port = parsedUrl.port if classpath is None: classpath = TikaServerClasspath - + global TikaClientOnly if not TikaClientOnly: serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path) @@ -539,7 +539,7 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti if Windows and hasattr(data, "read"): data = data.read() - + encodedData = data if type(data) is unicode_string: encodedData = data.encode('utf-8') @@ -676,7 +676,7 @@ def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, s # Patch for Windows support if Windows: if sys.version.startswith("2"): - # Python 2.x + # Python 2.x TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True) elif sys.version.startswith("3"): # Python 3.x @@ -712,7 +712,7 @@ def killServer(): try: os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM) except: - log.error("Failed to kill the current server session") + log.error("Failed to kill the current server session") time.sleep(1) # patch to support subprocess killing for windows if Windows: @@ -731,7 +731,7 @@ def killServer(): try: os.killpg(os.getpgid(TikaServerProcess.pid), signal.SIGTERM) except: - log.error("Failed to kill the current server session") + log.error("Failed to kill the current server session") time.sleep(1) else: log.error("Server not running, or was already running before") @@ -849,7 +849,7 @@ def getRemoteJar(urlOrPath, destPath): log.info('Retrieving %s to %s.' % (urlOrPath, destPath)) _urlretrieve(urlOrPath, destPath) return (destPath, 'remote') - + def checkPortIsOpen(remoteServerHost=ServerHost, port = Port): ''' Checks if the specified port is open diff --git a/tika/translate.py b/tika/translate.py index ea95548d..beb57c74 100644 --- a/tika/translate.py +++ b/tika/translate.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tika/unpack.py b/tika/unpack.py index bd0bf632..0cdcba09 100644 --- a/tika/unpack.py +++ b/tika/unpack.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# encoding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. From 7d0ef7080ab1a8267530c053474ec1b6bb650946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Heinz-Alexander=20F=C3=BCtterer?= <35225576+afuetterer@users.noreply.github.com> Date: Mon, 9 Mar 2026 09:13:05 +0100 Subject: [PATCH 4/4] unicode_string --- tika/tika.py | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/tika/tika.py b/tika/tika.py index 66fe6567..cf2f33fa 100755 --- a/tika/tika.py +++ b/tika/tika.py @@ -111,17 +111,7 @@ import sys import time from pathlib import Path - -try: - unicode_string = unicode - binary_string = str -except NameError: - unicode_string = str - binary_string = bytes -try: - from urlparse import urlparse -except ImportError: - from urllib.parse import urlparse as urlparse +from urllib.parse import urlparse as urlparse try: from rfc6266 import build_header @@ -197,7 +187,7 @@ def make_content_disposition_header(fn): class TikaException(Exception): pass -def echo2(*s): sys.stderr.write(unicode_string('tika.py: %s\n') % unicode_string(' ').join(map(unicode_string, s))) +def echo2(*s): sys.stderr.write(str('tika.py: %s\n') % str(' ').join(map(str, s))) def warn(*s): echo2('Warn:', *s) def die(*s): warn('Error:', *s); echo2(USAGE); sys.exit() @@ -246,7 +236,7 @@ def getPaths(urlOrPaths): :param urlOrPaths: the url or path to be scanned :return: ``list`` of paths ''' - if isinstance(urlOrPaths, unicode_string): + if isinstance(urlOrPaths, str): urlOrPaths = [urlOrPaths] # do not recursively walk over letters of a single path which can include "/" paths = [] for eachUrlOrPaths in urlOrPaths: @@ -326,13 +316,13 @@ def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, ti headers = headers or {} path, file_type = getRemoteFile(urlOrPath, TikaFilesPath) - headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) + headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path)}) if option not in services: log.warning('config option must be one of meta, text, or all; using all.') service = services.get(option, services['all']) if service == '/tika': responseMimeType = 'text/plain' - headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) + headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path)}) with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f: status, response = callServer('put', serverEndpoint, service, f, headers, verbose, tikaServerJar, config_path=config_path, @@ -375,8 +365,8 @@ def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos ''' path, mode = getRemoteFile(urlOrPath, TikaFilesPath) if option not in services: - log.exception('Language option must be one of %s ' % binary_string(services.keys())) - raise TikaException('Language option must be one of %s ' % binary_string(services.keys())) + log.exception('Language option must be one of %s ' % bytes(services.keys())) + raise TikaException('Language option must be one of %s ' % bytes(services.keys())) service = services[option] status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions) @@ -471,13 +461,13 @@ def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbos ''' path, mode = getRemoteFile(urlOrPath, TikaFilesPath) if option not in services: - log.exception('Detect option must be one of %s' % binary_string(services.keys())) - raise TikaException('Detect option must be one of %s' % binary_string(services.keys())) + log.exception('Detect option must be one of %s' % bytes(services.keys())) + raise TikaException('Detect option must be one of %s' % bytes(services.keys())) service = services[option] status, response = callServer('put', serverEndpoint, service, open(path, 'rb'), { 'Accept': responseMimeType, - 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path) + 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is str else path) }, verbose, tikaServerJar, config_path=config_path, requestOptions=requestOptions) if csvOutput == 1: @@ -533,15 +523,15 @@ def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, ti serviceUrl = serverEndpoint + service if verb not in httpVerbs: - log.exception('Tika Server call must be one of %s' % binary_string(httpVerbs.keys())) - raise TikaException('Tika Server call must be one of %s' % binary_string(httpVerbs.keys())) + log.exception('Tika Server call must be one of %s' % bytes(httpVerbs.keys())) + raise TikaException('Tika Server call must be one of %s' % bytes(httpVerbs.keys())) verbFn = httpVerbs[verb] if Windows and hasattr(data, "read"): data = data.read() encodedData = data - if type(data) is unicode_string: + if type(data) is str: encodedData = data.encode('utf-8') requestOptionsDefault = {