Skip to content

Commit 35762a2

Browse files
♻️ 💥 make logging configurable and default output to stderr
1 parent 3907387 commit 35762a2

7 files changed

Lines changed: 38 additions & 4 deletions

File tree

.rubocop.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@ Metrics/MethodLength:
3737

3838
Metrics/ClassLength:
3939
Max: 200
40-
40+
Metrics/ModuleLength:
41+
Max: 200
4142
Metrics/ParameterLists:
4243
Max: 8
4344

lib/mindee/logging/logger.rb

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,16 @@
55
module Mindee
66
# Mindee logging module.
77
module Logging
8-
@logger = Logger.new($stdout)
98
log_level = ENV.fetch('MINDEE_LOG_LEVEL', 'WARN')
9+
log_output = ENV.fetch('MINDEE_LOG_OUTPUT', 'stderr')
10+
@logger = if log_output == 'stderr'
11+
Logger.new($stderr)
12+
elsif log_output == 'stdout'
13+
Logger.new($stdout)
14+
else
15+
warn "Invalid MINDEE_LOG_OUTPUT='#{log_output}', defaulting to 'stderr'"
16+
Logger.new($stderr)
17+
end
1018
@logger.level = Logger.const_get(log_level)
1119

1220
class << self

lib/mindee/pdf/pdf_processor.rb

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,11 @@ def self.indexes_from_remove(page_indexes, all_pages)
6161
# @param io_stream [StringIO]
6262
# @return [Origami::PDF]
6363
def self.open_pdf(io_stream)
64+
unless PDFTools.pdf_header?(io_stream)
65+
raise Origami::InvalidPDFError,
66+
'Input stream does not contain a PDF header.'
67+
end
68+
6469
pdf_parser = Origami::PDF::LinearParser.new({ verbosity: Origami::Parser::VERBOSE_QUIET })
6570
io_stream.seek(0)
6671
pdf_parser.parse(io_stream)

lib/mindee/pdf/pdf_tools.rb

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,10 +53,28 @@ def self.stream_has_text?(stream)
5353
text_operators.any? { |op| data.include?(op) }
5454
end
5555

56+
# Checks whether a stream contains a PDF header near the beginning.
57+
# @param [StringIO] io_stream Binary-encoded stream.
58+
# @param [Integer] maximum_offset Maximum allowed offset to find '%PDF-'.
59+
# @return [bool] `true` when the stream appears to be a PDF.
60+
def self.pdf_header?(io_stream, maximum_offset: 500)
61+
initial_pos = nil
62+
initial_pos = io_stream.pos if io_stream.respond_to?(:pos)
63+
io_stream.seek(0)
64+
io_stream.gets('%PDF-')
65+
!(io_stream.eof? || io_stream.pos > maximum_offset)
66+
rescue TypeError, IOError, SystemCallError
67+
false
68+
ensure
69+
io_stream.seek(initial_pos) if !initial_pos.nil? && io_stream.respond_to?(:seek)
70+
end
71+
5672
# Checks whether the file has source_text. Returns false if the file isn't a PDF.
5773
# @param [StringIO] pdf_data A binary-encoded stream representing the PDF file.
5874
# @return [bool] `true` if the pdf has source text, false otherwise.
5975
def self.source_text?(pdf_data)
76+
return false unless pdf_header?(pdf_data)
77+
6078
begin
6179
pdf_data.rewind
6280
pdf = Origami::PDF.read(pdf_data)

sig/mindee/pdf/pdf_tools.rbs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@ module Mindee
88
Height: Integer | Float
99
Width: Integer | Float
1010

11+
def self.pdf_header?: (StringIO | File) -> bool
12+
1113
def to_io_stream: (?Hash[Symbol, untyped]) -> StringIO
1214
def intents_as_pdfa1: () -> void
1315
def delinearize!: () -> void

spec/bin/cli_integration.rb

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,7 @@ def run_cli(*args)
3030
it 'returns no models for non-existent name' do
3131
stdout, stderr, status = run_cli('v2', 'search-models', '-n', 'supercalifragilisticexpialidocious')
3232
expect(status.success?).to eq(true), stderr
33-
expect(['D, [2026-03-24T13:35:16.393819 #3543] DEBUG -- : API key set from environment',
34-
'']).to include(stdout.strip)
33+
expect(stdout.strip).to eq('')
3534
end
3635

3736
it 'returns models for name filter' do

spec/pdf/pdf_compressor_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020

2121
it 'should not detect text pdf in an image file.' do
2222
image_input = Mindee::Input::Source::PathInputSource.new("#{FILE_TYPES_DIR}/receipt.jpg")
23+
expect(Origami::PDF).not_to receive(:read)
2324
expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false)
2425
end
2526
end

0 commit comments

Comments
 (0)