diff --git a/.github/workflows/_test-cli.yml b/.github/workflows/_test-cli.yml new file mode 100644 index 000000000..bcb4e823c --- /dev/null +++ b/.github/workflows/_test-cli.yml @@ -0,0 +1,47 @@ +name: CLI Test + +on: + workflow_call: + workflow_dispatch: + +env: + MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} + MINDEE_V2_API_KEY: ${{ secrets.MINDEE_V2_SE_TESTS_API_KEY }} + MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID }} + +jobs: + test: + name: Run Tests + timeout-minutes: 30 + strategy: + max-parallel: 4 + matrix: + os_config: + - os: "ubuntu-24.04" + rid: "linux-x64" + - os: "macos-latest" + rid: "osx-x64" + - os: "windows-latest" + rid: "win-x64" + ruby: + - "3.0" + - "4.0" + runs-on: ${{ matrix.os_config.os }} + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: set up Ruby ${{ matrix.ruby }} + uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + bundler-cache: true + + - name: Tests V2 CLI + run: | + ./spec/test_v2_cli.sh ./spec/data/file_types/pdf/blank_1.pdf ${{ matrix.os_config.rid }} + + - name: Tests V1 CLI + run: | + ./spec/test_v1_cli.sh ./spec/data/file_types/pdf/blank_1.pdf ${{ matrix.os_config.rid }} diff --git a/.github/workflows/_test-integrations.yml b/.github/workflows/_test-integrations.yml index 75a72a712..ef025d4e8 100644 --- a/.github/workflows/_test-integrations.yml +++ b/.github/workflows/_test-integrations.yml @@ -7,6 +7,19 @@ on: workflow_call: workflow_dispatch: +env: + MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} + WORKFLOW_ID: ${{ secrets.WORKFLOW_ID_SE_TESTS }} + MINDEE_V2_API_KEY: ${{ secrets.MINDEE_V2_SE_TESTS_API_KEY }} + MINDEE_V2_FINDOC_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID }} + MINDEE_V2_SE_TESTS_BLANK_PDF_URL: ${{ secrets.MINDEE_V2_SE_TESTS_BLANK_PDF_URL }} + MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID }} + MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID: ${{ 
secrets.MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID }} + MINDEE_V2_SE_TESTS_CROP_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_CROP_MODEL_ID }} + MINDEE_V2_SE_TESTS_OCR_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_OCR_MODEL_ID }} + MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID }} + MINDEE_LOG_LEVEL: DEBUG + jobs: integration-tests: name: Run Integration Tests @@ -50,17 +63,6 @@ jobs: sudo sed -i "s/$SRC/$RPL/" /etc/ImageMagick-6/policy.xml - name: Run Rspec for integration tests - env: - MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }} - WORKFLOW_ID: ${{ secrets.WORKFLOW_ID_SE_TESTS }} - MINDEE_V2_API_KEY: ${{ secrets.MINDEE_V2_SE_TESTS_API_KEY }} - MINDEE_V2_FINDOC_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID }} - MINDEE_V2_SE_TESTS_BLANK_PDF_URL: ${{ secrets.MINDEE_V2_SE_TESTS_BLANK_PDF_URL }} - MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID }} - MINDEE_V2_SE_TESTS_CROP_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_CROP_MODEL_ID }} - MINDEE_V2_SE_TESTS_OCR_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_OCR_MODEL_ID }} - MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID: ${{ secrets.MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID }} - MINDEE_LOG_LEVEL: DEBUG run: | bundle exec rake integration diff --git a/.github/workflows/_test-smoke.yml b/.github/workflows/_test-smoke.yml index fe58901b1..dab35abb0 100644 --- a/.github/workflows/_test-smoke.yml +++ b/.github/workflows/_test-smoke.yml @@ -41,14 +41,15 @@ jobs: ruby-version: ${{ matrix.ruby }} bundler-cache: true - - name: Tests V1 code samples + - name: Tests V2 code samples env: MINDEE_LOG_LEVEL: DEBUG run: | - ./spec/test_code_samples_v1.sh ${{ secrets.MINDEE_ACCOUNT_SE_TESTS }} ${{ secrets.MINDEE_ENDPOINT_SE_TESTS }} + ./spec/test_v2_code_samples.sh - - name: Tests V2 code samples + - name: Tests V1 code samples env: MINDEE_LOG_LEVEL: DEBUG run: | - ./spec/test_code_samples_v2.sh + ./spec/test_v1_code_samples.sh ${{ secrets.MINDEE_ACCOUNT_SE_TESTS }} ${{ 
secrets.MINDEE_ENDPOINT_SE_TESTS }} + diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index bcfb6214f..ee6755c89 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -8,3 +8,6 @@ jobs: test-smoke: uses: mindee/mindee-api-ruby/.github/workflows/_test-smoke.yml@main secrets: inherit + test-cli: + uses: mindee/mindee-api-ruby/.github/workflows/_test-cli.yml@main + secrets: inherit diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index a4c8e5dd4..656762ad9 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -22,3 +22,7 @@ jobs: uses: ./.github/workflows/_test-smoke.yml needs: test-units secrets: inherit + test-cli: + uses: ./.github/workflows/_test-cli.yml + needs: test-units + secrets: inherit diff --git a/.rubocop.yml b/.rubocop.yml index fbb91a84e..d7fcf91c8 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -37,7 +37,8 @@ Metrics/MethodLength: Metrics/ClassLength: Max: 200 - +Metrics/ModuleLength: + Max: 200 Metrics/ParameterLists: Max: 8 diff --git a/bin/mindee.rb b/bin/mindee.rb index 6c303ef19..f6900dbed 100755 --- a/bin/mindee.rb +++ b/bin/mindee.rb @@ -3,127 +3,28 @@ require 'bundler/setup' require 'optparse' -require 'mindee' - -require_relative 'cli_products' - -options = {} - -# Initializes universal-specific options -# @param cli_parser [OptionParser] -def custom_subcommand(cli_parser, options) - cli_parser.on('-v [VERSION]', '--version [VERSION]', 'Model version for the API') do |v| - options[:endpoint_version] = v - end - cli_parser.on('-a ACCOUNT_NAME', '--account ACCOUNT_NAME', 'API account name for the endpoint') do |v| - options[:account_name] = v - end -end - -product_parser = {} -PRODUCTS.each do |doc_key, doc_value| - product_parser[doc_key] = OptionParser.new do |opts| - opts.on('-w', '--all-words', 'Include words in response') do |v| - options[:all_words] = v - end - opts.on('-c', '--cut-pages', "Cut document pages") do |v| - 
options[:cut_pages] = v - end - opts.on('-k [KEY]', '--key [KEY]', 'API key for the endpoint') do |v| - options[:api_key] = v - end - opts.on('-f', '--full', "Print the full data, including pages") do |v| - options[:print_full] = true - end - opts.on('-F', '--fix-pdf', "Attempts to fix broken PDF files before sending them to the server.") do |v| - options[:repair_pdf] = true - end - if doc_key != 'universal' - opts.banner = "#{doc_value[:description]}. \nUsage: \nmindee.rb universal [options] endpoint_name file\nor\nmindee.rb universal [options] endpoint_name file" - custom_subcommand(opts, options) - end - if doc_value[:async] - if doc_value[:sync] - opts.on("-A", "--async", "Call asynchronously") do |v| - options[:parse_async] = v - end - end - end +require_relative 'v1/parser' +require_relative 'v2/parser' + +def setup_main_parser + v1_parser = MindeeCLI::V1Parser.new(ARGV) + v2_parser = MindeeCLI::V2Parser.new(ARGV) + main_parser = OptionParser.new do |opts| + opts.banner = "Usage: mindee [command]" + opts.separator "Commands:" + opts.separator " v1 Use Version 1 of the Mindee API" + opts.separator " v2 Use Version 2 of the Mindee API" end -end - -global_parser = OptionParser.new do |opts| - opts.banner = "Usage: mindee.rb product [options] file" - opts.separator "Available products:" - opts.separator " #{PRODUCTS.keys.join("\n ")}" -end - -command = ARGV.shift -unless PRODUCTS.include?(command) - abort(global_parser.help) -end -doc_class = PRODUCTS[command][:doc_class] -product_parser[command].parse! + main_command = ARGV.shift -if command == 'universal' - if ARGV.length < 2 - $stderr.puts "The '#{command}' command requires both ENDPOINT_NAME and file arguments." 
- abort(product_parser[command].help) - end - endpoint_name = ARGV[0] - options[:file_path] = ARGV[1] -else - if ARGV.length < 1 - $stderr.puts "file missing" - abort(product_parser[command].help) - end - endpoint_name = nil - options[:file_path] = ARGV[0] -end - -mindee_client = Mindee::Client.new(api_key: options[:api_key]) -if options[:file_path].start_with?("https://") - input_source = mindee_client.source_from_url(options[:file_path]) -else - input_source = mindee_client.source_from_path(options[:file_path], repair_pdf: options[:repair_pdf]) -end - -if command == 'universal' - custom_endpoint = mindee_client.create_endpoint( - endpoint_name: endpoint_name, - account_name: options[:account_name], - version: options[:endpoint_version].nil? ? "1" : options[:endpoint_version] - ) -else - custom_endpoint = nil -end - -if options[:cut_pages].nil? || !options[:cut_pages].is_a?(Integer) || options[:cut_pages] < 0 - page_options = nil -else - page_options = Mindee::PageOptions.new(params: { - page_indexes: (0..options[:cut_pages].to_i).to_a, - operation: :KEEP_ONLY, - on_min_pages: 0, - }) -end - -if options[:parse_async].nil? - if !PRODUCTS[command][:sync] - options[:parse_async] = true + case main_command + when 'v1' + v1_parser.execute + when 'v2' + v2_parser.execute else - options[:parse_async] = false + abort(main_parser.help) end end -result = mindee_client.parse( - input_source, - doc_class, - options: { endpoint: custom_endpoint, - options: Mindee::ParseOptions.new(params: { page_options: page_options }), enqueue: options[:parse_async] } -) -if options[:print_full] - puts result.document -else - puts result.document.inference.prediction -end +setup_main_parser diff --git a/bin/v1/parser.rb b/bin/v1/parser.rb new file mode 100644 index 000000000..0be7bd381 --- /dev/null +++ b/bin/v1/parser.rb @@ -0,0 +1,153 @@ +# frozen_string_literal: true + +require 'mindee' +require_relative 'products' + +module MindeeCLI + # Mindee Command Line Interface + # V1 CLI class. 
+ class V1Parser + # @return [Array] + attr_reader :arguments + + # @return [OptionParser] + attr_reader :options_parser + + # @return [Parser] + attr_reader :product_parser + + def initialize(arguments) + @arguments = arguments + @options_parser = OptionParser.new do |opts| + opts.banner = 'Usage: mindee v1 product [options] file' + opts.separator 'Available products:' + opts.separator " #{V1_PRODUCTS.keys.join("\n ")}" + end + @product_parser = init_product_parser + end + + # @param cli_parser [OptionParser] + def custom_subcommand(cli_parser) + cli_parser.on('-v [VERSION]', '--version [VERSION]', 'Model version for the API') do |v| + @options[:endpoint_version] = v + end + cli_parser.on('-a ACCOUNT_NAME', '--account ACCOUNT_NAME', 'API account name for the endpoint') do |v| + @options[:account_name] = v + end + end + + # @return [Hash] + def init_product_parser + v1_product_parser = {} + V1_PRODUCTS.each do |doc_key, doc_value| + v1_product_parser[doc_key] = OptionParser.new do |options_parser| + options_parser.on('-w', '--all-words', 'Include words in response') { |v| @options[:all_words] = v } + options_parser.on('-k [KEY]', '--key [KEY]', 'API key for the endpoint') { |v| @options[:api_key] = v } + options_parser.on('-o FORMAT', '--output-format FORMAT', ['raw', 'full', 'summary'], + 'Format of the output (raw, full, summary). 
Default: summary') do |format| + @options[:output_format] = format + end + options_parser.on('-F', '--fix-pdf', 'Repair PDF') { @options[:repair_pdf] = true } + + if doc_key != 'universal' + options_parser.banner = "Usage: mindee v1 #{doc_key} [options] file" + custom_subcommand(options_parser) + end + + if doc_value[:async] && doc_value[:sync] + options_parser.on('-A', '--async', 'Call asynchronously') { |v| @options[:parse_async] = v } + end + end + end + v1_product_parser + end + + # @param product_command [String] + # @param endpoint_name [String, nil] + # @param options [Hash] + # @return [Mindee::Parsing::Common::ApiResponse] + def send(product_command, endpoint_name, options) + mindee_client = Mindee::Client.new(api_key: options[:api_key]) + doc_class = V1_PRODUCTS[product_command][:doc_class] + input_source = setup_input_source(mindee_client, options) + custom_endpoint = setup_endpoint(mindee_client, product_command, endpoint_name, options) + options[:parse_async] = !V1_PRODUCTS[product_command][:sync] if options[:parse_async].nil? + + mindee_client.parse( + input_source, + doc_class, + options: { endpoint: custom_endpoint, + enqueue: options[:parse_async] } + ) + end + + def print_result(result, output_format) + if output_format == :raw + puts JSON.pretty_generate(JSON.parse(result.raw_http)) + else + puts output_format == :full ? result.document : result.document.inference.prediction + end + end + + # @param product_command [String] + def execute + @options = { output_format: :summary } + product_command = @arguments.shift + + abort(@options_parser.help) unless V1_PRODUCTS.include?(product_command) + @product_parser[product_command].parse! + + if product_command == 'universal' + if @arguments.length < 2 + warn "The 'universal' command requires both ENDPOINT_NAME and file arguments." + abort(@product_parser[product_command].help) + end + endpoint_name = @arguments[0] + @options[:file_path] = @arguments[1] + else + if @arguments.empty? 
+ warn 'file missing' + abort(@product_parser[product_command].help) + end + endpoint_name = nil + @options[:file_path] = @arguments[0] + end + + result = send(product_command, endpoint_name, @options) + print_result(result, output_format) + end + + private + + # @return [Symbol] + def output_format + @options[:output_format]&.to_sym || :summary + end + + # @param mindee_client [Mindee::V1::Client] + # @param options [Hash] + # @return [Hash] + def setup_input_source(mindee_client, options) + if options[:file_path].start_with?('https://') + mindee_client.source_from_url(options[:file_path]) + else + mindee_client.source_from_path(options[:file_path], repair_pdf: options[:repair_pdf]) + end + end + + # @param mindee_client [Mindee::V1::Client] + # @param product_command [String] + # @param endpoint_name [String, nil] + # @param options [Hash] + # @return [Mindee::HTTP::Endpoint, nil] + def setup_endpoint(mindee_client, product_command, endpoint_name, options) + return unless product_command == 'universal' + + mindee_client.create_endpoint( + endpoint_name: endpoint_name, + account_name: options[:account_name], + version: options[:endpoint_version] || '1' + ) + end + end +end diff --git a/bin/cli_products.rb b/bin/v1/products.rb similarity index 63% rename from bin/cli_products.rb rename to bin/v1/products.rb index 41df18367..96cf465b9 100644 --- a/bin/cli_products.rb +++ b/bin/v1/products.rb @@ -1,166 +1,166 @@ # frozen_string_literal: true -PRODUCTS = { - "universal" => { - description: "Universal document type from API builder", +V1_PRODUCTS = { + 'universal' => { + description: 'Universal document type from API builder', doc_class: Mindee::Product::Universal::Universal, sync: true, async: true, }, - "barcode-reader" => { - description: "Barcode Reader", + 'barcode-reader' => { + description: 'Barcode Reader', doc_class: Mindee::Product::BarcodeReader::BarcodeReaderV1, sync: true, async: false, }, - "bill-of-lading" => { - description: "Bill of Lading", + 
'bill-of-lading' => { + description: 'Bill of Lading', doc_class: Mindee::Product::BillOfLading::BillOfLadingV1, sync: false, async: true, }, - "business-card" => { - description: "Business Card", + 'business-card' => { + description: 'Business Card', doc_class: Mindee::Product::BusinessCard::BusinessCardV1, sync: false, async: true, }, - "cropper" => { - description: "Cropper", + 'cropper' => { + description: 'Cropper', doc_class: Mindee::Product::Cropper::CropperV1, sync: true, async: false, }, - "delivery-note" => { - description: "Delivery note", + 'delivery-note' => { + description: 'Delivery note', doc_class: Mindee::Product::DeliveryNote::DeliveryNoteV1, sync: false, async: true, }, - "driver-license" => { - description: "Driver License", + 'driver-license' => { + description: 'Driver License', doc_class: Mindee::Product::DriverLicense::DriverLicenseV1, sync: false, async: true, }, - "financial-document" => { - description: "Financial Document", + 'financial-document' => { + description: 'Financial Document', doc_class: Mindee::Product::FinancialDocument::FinancialDocumentV1, sync: true, async: true, }, - "fr-bank-account-details" => { - description: "Bank Account Details", + 'fr-bank-account-details' => { + description: 'Bank Account Details', doc_class: Mindee::Product::FR::BankAccountDetails::BankAccountDetailsV2, sync: true, async: false, }, - "fr-bank-statement" => { - description: "Bank Statement", + 'fr-bank-statement' => { + description: 'Bank Statement', doc_class: Mindee::Product::FR::BankStatement::BankStatementV2, sync: false, async: true, }, - "fr-carte-grise" => { - description: "Carte Grise", + 'fr-carte-grise' => { + description: 'Carte Grise', doc_class: Mindee::Product::FR::CarteGrise::CarteGriseV1, sync: true, async: false, }, - "fr-energy-bill" => { - description: "Energy Bill", + 'fr-energy-bill' => { + description: 'Energy Bill', doc_class: Mindee::Product::FR::EnergyBill::EnergyBillV1, sync: false, async: true, }, - "fr-health-card" => 
{ - description: "Health Card", + 'fr-health-card' => { + description: 'Health Card', doc_class: Mindee::Product::FR::HealthCard::HealthCardV1, sync: false, async: true, }, - "fr-carte-nationale-d-identite" => { + 'fr-carte-nationale-d-identite' => { description: "Carte Nationale d'Identité", doc_class: Mindee::Product::FR::IdCard::IdCardV2, sync: true, async: false, }, - "fr-payslip" => { - description: "Payslip", + 'fr-payslip' => { + description: 'Payslip', doc_class: Mindee::Product::FR::Payslip::PayslipV3, sync: false, async: true, }, - "ind-passport-india" => { - description: "Passport - India", + 'ind-passport-india' => { + description: 'Passport - India', doc_class: Mindee::Product::IND::IndianPassport::IndianPassportV1, sync: false, async: true, }, - "international-id" => { - description: "International ID", + 'international-id' => { + description: 'International ID', doc_class: Mindee::Product::InternationalId::InternationalIdV2, sync: false, async: true, }, - "invoice" => { - description: "Invoice", + 'invoice' => { + description: 'Invoice', doc_class: Mindee::Product::Invoice::InvoiceV4, sync: true, async: true, }, - "invoice-splitter" => { - description: "Invoice Splitter", + 'invoice-splitter' => { + description: 'Invoice Splitter', doc_class: Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, sync: false, async: true, }, - "multi-receipts-detector" => { - description: "Multi Receipts Detector", + 'multi-receipts-detector' => { + description: 'Multi Receipts Detector', doc_class: Mindee::Product::MultiReceiptsDetector::MultiReceiptsDetectorV1, sync: true, async: false, }, - "nutrition-facts-label" => { - description: "Nutrition Facts Label", + 'nutrition-facts-label' => { + description: 'Nutrition Facts Label', doc_class: Mindee::Product::NutritionFactsLabel::NutritionFactsLabelV1, sync: false, async: true, }, - "passport" => { - description: "Passport", + 'passport' => { + description: 'Passport', doc_class: Mindee::Product::Passport::PassportV1, 
sync: true, async: false, }, - "receipt" => { - description: "Receipt", + 'receipt' => { + description: 'Receipt', doc_class: Mindee::Product::Receipt::ReceiptV5, sync: true, async: true, }, - "resume" => { - description: "Resume", + 'resume' => { + description: 'Resume', doc_class: Mindee::Product::Resume::ResumeV1, sync: false, async: true, }, - "us-bank-check" => { - description: "Bank Check", + 'us-bank-check' => { + description: 'Bank Check', doc_class: Mindee::Product::US::BankCheck::BankCheckV1, sync: true, async: false, }, - "us-healthcare-card" => { - description: "Healthcare Card", + 'us-healthcare-card' => { + description: 'Healthcare Card', doc_class: Mindee::Product::US::HealthcareCard::HealthcareCardV1, sync: false, async: true, }, - "us-us-mail" => { - description: "US Mail", + 'us-us-mail' => { + description: 'US Mail', doc_class: Mindee::Product::US::UsMail::UsMailV3, sync: false, async: true, }, -} +}.freeze diff --git a/bin/v2/parser.rb b/bin/v2/parser.rb new file mode 100644 index 000000000..343bfab09 --- /dev/null +++ b/bin/v2/parser.rb @@ -0,0 +1,235 @@ +# frozen_string_literal: true + +require 'mindee' +require_relative 'products' + +module MindeeCLI + # Mindee Command Line Interface + # V2 CLI class. + class V2Parser + # @return [Array] + attr_reader :arguments + + # @return [OptionParser] + attr_reader :options_parser + + # @return [Parser] + attr_reader :product_parser + + # @return [Parser] + attr_reader :search_parser + + def initialize(arguments) + @arguments = arguments + @options_parser = OptionParser.new do |opts| + opts.banner = 'Usage: mindee v2 command [options]' + end + @product_parser = init_product_parser + @search_parser = init_search_parser + end + + # Summarize and print the result of the command. + # @param command [String] + def print_result(command) + if command == 'search-models' + @search_parser.parse!(@arguments) + result = search(@options) + summarized_result = output_format == :full ? 
result.to_s : result.models.to_s + else + @product_parser[command].parse!(@arguments) + @options[:file_path] = @arguments.shift + if @options[:file_path].nil? + warn 'file missing' + abort(@product_parser[command].help) + end + result = send(command, @options) + summarized_result = output_format == :full ? result.inference.to_s : result.inference.result.to_s + end + + if output_format == :raw + puts JSON.pretty_generate(raw_payload(result.raw_http)) + else + puts summarized_result + end + end + + # Executes the command. + # @return [void] + def execute + @options = { output_format: :summary } + command = @arguments.shift + + validate_command!(command) + print_result(command) + rescue OptionParser::InvalidOption, OptionParser::MissingArgument => e + if command == 'search-models' + abort("#{e.message}\n\n#{@search_parser.help}") + else + abort("#{e.message}\n\n#{@product_parser[command].help}") + end + end + + private + + def validate_command!(command) + return if V2_PRODUCTS.include?(command) || command == 'search-models' + + error_msg = "#{@options_parser.help}\nAvailable commands:\n" + error_msg += " #{'search-models'.ljust(50)}Search for available models for this API key\n" + + V2_PRODUCTS.each do |product_key, product_values| + error_msg += " #{product_key.to_s.ljust(50)}#{product_values[:description]}\n" + end + abort(error_msg) + end + + def init_search_parser + OptionParser.new do |options_parser| + options_parser.banner = 'Usage: mindee v2 search-models [options]' + init_common_options(options_parser) + options_parser.on('-n [NAME]', '--name [NAME]', + 'Search for partial matches in model name. Note: case insensitive') do |v| + @options[:model_name] = v + end + options_parser.on('-t [NAME]', '--type [NAME]', + 'Search for EXACT matches in model type. 
Note: case sensitive') do |v| + @options[:model_type] = v + end + end + end + + def setup_specific_options(options_parser, doc_value) + options_parser.on('-r', '--rag', 'Enable RAG') { @options[:rag] = true } if doc_value.key?(:rag) + if doc_value.key?(:raw_text) + options_parser.on('-R', '--raw-text', 'Enable Raw Text retrieval') do + @options[:raw_text] = true + end + end + if doc_value.key?(:confidence) + options_parser.on('-c', '--confidence', 'Enable confidence scores') do + @options[:confidence] = true + end + end + options_parser.on('-p', '--polygon', 'Enable polygons') { @options[:polygon] = true } if doc_value.key?(:polygon) + if doc_value.key?(:text_context) + options_parser.on('-t [TEXT CONTEXT]', '--text-context [TEXT CONTEXT]', 'Add Text Context') do |v| + @options[:text_context] = v + end + end + return unless doc_value.key?(:data_schema) + + options_parser.on('-d [DATA SCHEMA]', '--data-schema [DATA SCHEMA]', 'Add Data Schema') do |v| + @options[:data_schema] = v + end + end + + # Initialize common options for search and product commands. + # @param options_parser [OptionParser] + def init_common_options(options_parser) + options_parser.on('-k [KEY]', '--key [KEY]', 'API key for the endpoint') { |v| @options[:api_key] = v } + options_parser.on('-o FORMAT', '--output-format FORMAT', ['raw', 'full', 'summary'], + 'Format of the output (raw, full, summary). Default: summary') do |format| + @options[:output_format] = format + end + end + + # @return [Symbol] + def output_format + @options[:output_format]&.to_sym || :summary + end + + # Handles JSON payloads represented either as a string or an already-parsed hash. + # Also tolerates one extra JSON encoding layer. 
+ # @param payload [String, Hash] + # @return [Hash, Array, String] + def raw_payload(payload) + parsed_payload = payload + 2.times do + break unless parsed_payload.is_a?(String) + + parsed_payload = JSON.parse(parsed_payload) + rescue JSON::ParserError + break + end + parsed_payload + end + + # @return [Hash] + def init_product_parser + v2_product_parser = {} + V2_PRODUCTS.each do |product_key, product_values| + v2_product_parser[product_key] = OptionParser.new do |options_parser| + options_parser.banner = "Usage: mindee v2 #{product_key} [options] file" + options_parser.on('-m MODEL_ID', '--model-id MODEL_ID', 'Model ID') { |v| @options[:model_id] = v } + options_parser.on('-a ALIAS', '--alias ALIAS', 'Add a file alias to the response') do |v| + @options[:alias] = v + end + init_common_options(options_parser) + options_parser.on('-F', '--fix-pdf', 'Attempt to repair PDF before enqueueing') do + @options[:repair_pdf] = true + end + setup_specific_options(options_parser, product_values) + end + end + v2_product_parser + end + + # @return [Hash] + def setup_product_params + params = { model_id: @options[:model_id] } + @options.each_pair do |key, value| + params[key] = value if V2_PRODUCTS['extraction'].include?(key) + end + params + end + + # @param product_command [String] + # @param options [Hash] + # @return [Mindee::Parsing::Common::ApiResponse] + def send(product_command, options) + mindee_client = Mindee::ClientV2.new(api_key: options[:api_key]) + response_class = V2_PRODUCTS[product_command][:response_class] + input_source = setup_input_source(options) + params = setup_product_params + + mindee_client.enqueue_and_get_result( + response_class, + input_source, + params + ) + end + + # @param options [Hash] + # @return [Mindee::V2::Parsing::Search::SearchResponse] + def search(options) + mindee_client = Mindee::ClientV2.new(api_key: options[:api_key]) + mindee_client.search_models(options[:model_name], options[:model_type]) + end + + # @param options [Hash] + # 
@return [Mindee::Input::InputSource] + def setup_input_source(options) + if options[:file_path].start_with?('https://') + Mindee::Input::Source::URLInputSource.new(options[:file_path]) + else + Mindee::Input::Source::PathInputSource.new(options[:file_path], repair_pdf: options[:repair_pdf]) + end + end + + # @param options [Hash] + # @return [Hash, nil] + def setup_page_options(options) + if options[:cut_pages].nil? || !options[:cut_pages].is_a?(Integer) || + options[:cut_pages].negative? + nil + else + + { + page_indexes: (0..options[:cut_pages].to_i).to_a, + operation: :KEEP_ONLY, + on_min_pages: 0, + } + end + end + end +end diff --git a/bin/v2/products.rb b/bin/v2/products.rb new file mode 100644 index 000000000..be9320cd4 --- /dev/null +++ b/bin/v2/products.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +require 'mindee/v2/product' + +# NOTE: keep product names as string instead of symbols due to kebab-case. + +V2_PRODUCTS = { + 'classification' => { + description: 'Classification Utility', + response_class: Mindee::V2::Product::Classification::Classification, + }, + 'extraction' => { + description: 'Extraction Inference', + response_class: Mindee::V2::Product::Extraction::Extraction, + rag: true, + polygon: true, + confidence: true, + raw_text: true, + text_context: true, + data_schema: true, + }, + 'crop' => { + description: 'Crop Utility', + response_class: Mindee::V2::Product::Crop::Crop, + }, + 'ocr' => { + description: 'OCR Utility', + response_class: Mindee::V2::Product::Ocr::Ocr, + }, + 'split' => { + description: 'Split Utility', + response_class: Mindee::V2::Product::Split::Split, + }, +}.freeze diff --git a/lib/mindee.rb b/lib/mindee.rb index d05a94203..a9020a0f4 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -90,6 +90,13 @@ module V2 # Product-specific module. module Product end + + # V2 parsing module. + module Parsing + # V2 search module. 
+ module Search + end + end end end diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb index 39ef0dec8..cf21f03ce 100644 --- a/lib/mindee/client.rb +++ b/lib/mindee/client.rb @@ -162,7 +162,7 @@ def parse_sync(input_source, product_class, endpoint, options) options ) - Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http.to_s) + Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http) end # Enqueue a document for async parsing @@ -200,7 +200,7 @@ def enqueue(input_source, product_class, endpoint: nil, options: {}) input_source, opts ) - Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http.to_json) + Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http) end # Parses a queued document @@ -215,7 +215,7 @@ def parse_queued(job_id, product_class, endpoint: nil) endpoint = initialize_endpoint(product_class) if endpoint.nil? logger.debug("Fetching queued document as '#{endpoint.url_root}'") prediction, raw_http = endpoint.parse_async(job_id) - Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http.to_json) + Mindee::Parsing::Common::ApiResponse.new(product_class, prediction, raw_http) end # Enqueue a document for async parsing and automatically try to retrieve it diff --git a/lib/mindee/client_v2.rb b/lib/mindee/client_v2.rb index 835d7642d..d591fe08d 100644 --- a/lib/mindee/client_v2.rb +++ b/lib/mindee/client_v2.rb @@ -110,6 +110,16 @@ def enqueue_and_get_result( "Asynchronous parsing request timed out after #{sec_count} seconds" end + # Searches for a list of available models for the given API key. + # @param model_name [String] + # @param model_type [String] + # @return [Mindee::V2::Parsing::Search::SearchResponse] + def search_models(model_name, model_type) + @mindee_api.search_models(model_name, model_type) + end + + private + # If needed, converts the parsing options provided as a hash into a proper BaseParameters subclass object. 
# @param params [Hash, Class] Params. # @return [BaseParameters] diff --git a/lib/mindee/http/mindee_api_v2.rb b/lib/mindee/http/mindee_api_v2.rb index ba210d6ee..a7dcbffc2 100644 --- a/lib/mindee/http/mindee_api_v2.rb +++ b/lib/mindee/http/mindee_api_v2.rb @@ -56,8 +56,52 @@ def req_get_job(job_id) Mindee::V2::Parsing::JobResponse.new(process_response(response)) end + # Retrieves a list of models. + # @param model_name [String, nil] + # @param model_type [String, nil] + # @return [Mindee::V2::Parsing::Search::SearchResponse] + def search_models(model_name, model_type) + Mindee::V2::Parsing::Search::SearchResponse.new(process_response(req_get_search_models(model_name, model_type))) + end + private + # Retrieves a list of models. + # @param model_name [String, nil] + # @param model_type [String, nil] + # @return [Net::HTTPResponse] + def req_get_search_models(model_name, model_type) + url = "#{@settings.base_url}/v2/search/models" + uri = URI(url) + + query_params = {} + query_params[:name] = model_name if model_name + query_params[:model_type] = model_type if model_type + uri.query = URI.encode_www_form(query_params) unless query_params.empty? + + headers = { + 'Authorization' => @settings.api_key, + 'User-Agent' => @settings.user_agent, + } + req = Net::HTTP::Get.new(uri, headers) + req['Transfer-Encoding'] = 'chunked' + + Net::HTTP.start(uri.hostname, uri.port, use_ssl: true, read_timeout: @settings.request_timeout) do |http| + return http.request(req) + end + raise Mindee::Errors::MindeeError, 'Could not resolve server response.' + end + + # @param resource [String] Resource to check. + # @return [Boolean] + def uri?(resource) + uri = URI.parse(resource) + throw Mindee::Errors::MindeeError, 'HTTP is not supported.' if uri.scheme == 'http' + uri.scheme == 'https' + rescue URI::BadURIError, URI::InvalidURIError + false + end + # Retrieves a queued job. 
# # @param url [String] @@ -79,16 +123,6 @@ def req_get_result_url(result_class, url) result_class.new(process_response(response)) end - # @param resource [String] Resource to check. - # @return [Boolean] - def uri?(resource) - uri = URI.parse(resource) - throw Mindee::Errors::MindeeError, 'HTTP is not supported.' if uri.scheme == 'http' - uri.scheme == 'https' - rescue URI::BadURIError, URI::InvalidURIError - false - end - # Converts an HTTP response to a parsed response object. # # @param response [Net::HTTPResponse, nil] diff --git a/lib/mindee/logging/logger.rb b/lib/mindee/logging/logger.rb index ec60b5eb6..2bc2c62d3 100644 --- a/lib/mindee/logging/logger.rb +++ b/lib/mindee/logging/logger.rb @@ -5,8 +5,16 @@ module Mindee # Mindee logging module. module Logging - @logger = Logger.new($stdout) log_level = ENV.fetch('MINDEE_LOG_LEVEL', 'WARN') + log_output = ENV.fetch('MINDEE_LOG_OUTPUT', 'stderr') + @logger = if log_output == 'stderr' + Logger.new($stderr) + elsif log_output == 'stdout' + Logger.new($stdout) + else + warn "Invalid MINDEE_LOG_OUTPUT='#{log_output}', defaulting to 'stderr'" + Logger.new($stderr) + end @logger.level = Logger.const_get(log_level) class << self diff --git a/lib/mindee/parsing/common/api_response.rb b/lib/mindee/parsing/common/api_response.rb index 7e9f47349..5689de2e9 100644 --- a/lib/mindee/parsing/common/api_response.rb +++ b/lib/mindee/parsing/common/api_response.rb @@ -39,10 +39,10 @@ class ApiResponse # @param product_class [Mindee::Inference] # @param http_response [Hash] - # @param raw_http [Hash] + # @param raw_http [String] def initialize(product_class, http_response, raw_http) logger.debug('Handling API response') - @raw_http = raw_http.to_s + @raw_http = raw_http raise Errors::MindeeAPIError, 'Invalid response format.' 
unless http_response.key?('api_request') @api_request = Mindee::Parsing::Common::ApiRequest.new(http_response['api_request']) diff --git a/lib/mindee/pdf/pdf_processor.rb b/lib/mindee/pdf/pdf_processor.rb index 4f37116f4..4796c0e3c 100644 --- a/lib/mindee/pdf/pdf_processor.rb +++ b/lib/mindee/pdf/pdf_processor.rb @@ -61,6 +61,11 @@ def self.indexes_from_remove(page_indexes, all_pages) # @param io_stream [StringIO] # @return [Origami::PDF] def self.open_pdf(io_stream) + unless PDFTools.pdf_header?(io_stream) + raise Origami::InvalidPDFError, + 'Input stream does not contain a PDF header.' + end + pdf_parser = Origami::PDF::LinearParser.new({ verbosity: Origami::Parser::VERBOSE_QUIET }) io_stream.seek(0) pdf_parser.parse(io_stream) diff --git a/lib/mindee/pdf/pdf_tools.rb b/lib/mindee/pdf/pdf_tools.rb index c3e6bb549..e2ee8bf6e 100644 --- a/lib/mindee/pdf/pdf_tools.rb +++ b/lib/mindee/pdf/pdf_tools.rb @@ -53,10 +53,28 @@ def self.stream_has_text?(stream) text_operators.any? { |op| data.include?(op) } end + # Checks whether a stream contains a PDF header near the beginning. + # @param [StringIO] io_stream Binary-encoded stream. + # @param [Integer] maximum_offset Maximum allowed offset to find '%PDF-'. + # @return [bool] `true` when the stream appears to be a PDF. + def self.pdf_header?(io_stream, maximum_offset: 500) + initial_pos = nil + initial_pos = io_stream.pos if io_stream.respond_to?(:pos) + io_stream.seek(0) + io_stream.gets('%PDF-') + !(io_stream.eof? || io_stream.pos > maximum_offset) + rescue TypeError, IOError, SystemCallError + false + ensure + io_stream.seek(initial_pos) if !initial_pos.nil? && io_stream.respond_to?(:seek) + end + # Checks whether the file has source_text. Sends false if the file isn't a PDF. # @param [StringIO] pdf_data Abinary-encoded stream representing the PDF file. # @return [bool] `true` if the pdf has source text, false otherwise. 
def self.source_text?(pdf_data) + return false unless pdf_header?(pdf_data) + begin pdf_data.rewind pdf = Origami::PDF.read(pdf_data) diff --git a/lib/mindee/v2/parsing.rb b/lib/mindee/v2/parsing.rb index f8784e63a..46fd19e4b 100644 --- a/lib/mindee/v2/parsing.rb +++ b/lib/mindee/v2/parsing.rb @@ -13,3 +13,4 @@ require_relative 'parsing/rag_metadata' require_relative 'parsing/raw_text' require_relative 'parsing/raw_text_page' +require_relative 'parsing/search' diff --git a/lib/mindee/v2/parsing/common_response.rb b/lib/mindee/v2/parsing/common_response.rb index ad631411f..bcdb7223a 100644 --- a/lib/mindee/v2/parsing/common_response.rb +++ b/lib/mindee/v2/parsing/common_response.rb @@ -1,16 +1,18 @@ # frozen_string_literal: true +require 'json' + module Mindee module V2 module Parsing # Base class for inference and job responses on the V2 API. class CommonResponse - # @return [Hash] + # @return [String] attr_reader :raw_http # @param http_response [Hash] def initialize(http_response) - @raw_http = http_response + @raw_http = JSON.generate(http_response) end end end diff --git a/lib/mindee/v2/parsing/search.rb b/lib/mindee/v2/parsing/search.rb new file mode 100644 index 000000000..4972c44e8 --- /dev/null +++ b/lib/mindee/v2/parsing/search.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +require_relative 'search/pagination_metadata' +require_relative 'search/search_model' +require_relative 'search/search_models' +require_relative 'search/search_response' diff --git a/lib/mindee/v2/parsing/search/pagination_metadata.rb b/lib/mindee/v2/parsing/search/pagination_metadata.rb new file mode 100644 index 000000000..207a068a4 --- /dev/null +++ b/lib/mindee/v2/parsing/search/pagination_metadata.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module Parsing + module Search + # Pagination Metadata data associated with model search. + class PaginationMetadata + # @return [Integer] Number of items per page. 
+ attr_reader :per_page + + # @return [Integer] 1-indexed page number. + attr_reader :page + + # @return [Integer] Total items. + attr_reader :total_items + + # @return [Integer] Total number of pages. + attr_reader :total_pages + + # @param raw_response [Hash] The parsed JSON payload mapping to pagination metadata. + def initialize(raw_response) + @per_page = raw_response['per_page'] + @page = raw_response['page'] + @total_items = raw_response['total_items'] + @total_pages = raw_response['total_pages'] + end + + # String representation of the pagination metadata. + # @return [String] + def to_s + [ + ":Per Page: #{@per_page}", + ":Page: #{@page}", + ":Total Items: #{@total_items}", + ":Total Pages: #{@total_pages}", + '', + ].join("\n") + end + end + end + end + end +end diff --git a/lib/mindee/v2/parsing/search/search_model.rb b/lib/mindee/v2/parsing/search/search_model.rb new file mode 100644 index 000000000..ee3ec5edc --- /dev/null +++ b/lib/mindee/v2/parsing/search/search_model.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module Parsing + module Search + # Individual model information. + class SearchModel + # @return [String] ID of the model. + attr_reader :id + + # @return [String] Name of the model. + attr_reader :name + + # @return [String] Type of the model. + attr_reader :model_type + + # @param payload [Hash] The parsed JSON payload mapping to the search model. + def initialize(payload) + @id = payload['id'] + @name = payload['name'] + @model_type = payload['model_type'] + end + + # String representation of the model. 
+ # @return [String] + def to_s + [ + ":Name: #{@name}", + ":ID: #{@id}", + ":Model Type: #{@model_type}", + ].join("\n") + end + end + end + end + end +end diff --git a/lib/mindee/v2/parsing/search/search_models.rb b/lib/mindee/v2/parsing/search/search_models.rb new file mode 100644 index 000000000..9e90f0d62 --- /dev/null +++ b/lib/mindee/v2/parsing/search/search_models.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module Parsing + module Search + # Array of search models. + class SearchModels < Array + def initialize(prediction) + super(prediction.map { |entry| SearchModel.new(entry) }) + end + + # Default string representation. + # @return [String] + def to_s + return "\n" if empty? + + lines = flat_map do |model| + [ + "* :Name: #{model.name}", + " :ID: #{model.id}", + " :Model Type: #{model.model_type}", + ] + end + + # Joins all lines with a newline and appends a final newline + # to perfectly match the C# StringBuilder output. + "#{lines.join("\n")}\n" + end + end + end + end + end +end diff --git a/lib/mindee/v2/parsing/search/search_response.rb b/lib/mindee/v2/parsing/search/search_response.rb new file mode 100644 index 000000000..4ab081d2e --- /dev/null +++ b/lib/mindee/v2/parsing/search/search_response.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Mindee + module V2 + module Parsing + module Search + # Models search response. + class SearchResponse < CommonResponse + # @return [Search::SearchModels] Parsed list of models. + attr_reader :models + # @return [Search::PaginationMetadata] Pagination metadata. 
+ attr_reader :pagination_metadata + + def initialize(server_response) + super + + @models = Search::SearchModels.new(server_response['models']) + @pagination_metadata = PaginationMetadata.new(server_response['pagination']) + end + + def to_s + [ + 'Models', + '######', + models.to_s, + 'Pagination Metadata', + '###################', + pagination_metadata.to_s, + '', + ].join("\n") + end + end + end + end + end +end diff --git a/mindee.gemspec b/mindee.gemspec index eeba5d4e2..af6cc1bc9 100644 --- a/mindee.gemspec +++ b/mindee.gemspec @@ -22,7 +22,9 @@ Gem::Specification.new do |spec| `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(.github|spec|features)/}) } end spec.bindir = 'bin' - spec.executables = Dir['bin/*'].map { |f| File.basename(f) }.reject { |f| f == 'cli_products.rb' } + spec.executables = Dir.children('bin') + .select { |f| File.file?(File.join('bin', f)) } + .reject { |f| f == 'products.rb' } spec.require_paths = ['lib'] spec.required_ruby_version = Gem::Requirement.new('>= 3.0') diff --git a/sig/mindee/client_v2.rbs b/sig/mindee/client_v2.rbs index ab69e9e6f..14290e9e9 100644 --- a/sig/mindee/client_v2.rbs +++ b/sig/mindee/client_v2.rbs @@ -11,6 +11,9 @@ module Mindee def get_job: (String job_id) -> V2::Parsing::JobResponse def enqueue: [T] (HTTP::_ProductClass[T] product, Input::Source::LocalInputSource | Input::Source::URLInputSource, Hash[String | Symbol, untyped] | Input::BaseParameters params) -> V2::Parsing::JobResponse def enqueue_and_get_result: [T] (HTTP::_ProductClass[T] product, Input::Source::LocalInputSource | Input::Source::URLInputSource, Hash[String | Symbol, untyped] | Input::BaseParameters params) -> T + + def search_models: (String?, String?) 
-> Mindee::V2::Parsing::Search::SearchResponse + def validate_async_params: (Integer | Float, Integer | Float, Integer) -> void def normalize_parameters: (singleton(Input::BaseParameters) param_class, Hash[String | Symbol, untyped] | Input::BaseParameters params) -> Input::BaseParameters end diff --git a/sig/mindee/http/mindee_api_v2.rbs b/sig/mindee/http/mindee_api_v2.rbs index ee2a35e3e..83c350b95 100644 --- a/sig/mindee/http/mindee_api_v2.rbs +++ b/sig/mindee/http/mindee_api_v2.rbs @@ -25,12 +25,17 @@ module Mindee def result_req_get: [T] (String, _ProductClass[T] product) -> Net::HTTPResponse def enqueue: (Input::Source::LocalInputSource | Input::Source::URLInputSource, Input::BaseParameters) -> Net::HTTPResponse? + def search_models: (String?, String?) -> Mindee::V2::Parsing::Search::SearchResponse + private def enqueue_form_options: (Array[Array[untyped]], V2::Product::Extraction::Params::ExtractionParameters) -> Array[Array[untyped]] def req_get_job_url: (String) -> V2::Parsing::JobResponse def req_get_result_url: [T] (_ResponseFactory[T] result_class, String url) -> T + + def req_get_search_models: (String?, String?) -> Net::HTTPResponse + def uri?: (String) -> bool end end diff --git a/sig/mindee/parsing/common/api_response.rbs b/sig/mindee/parsing/common/api_response.rbs index 126363bd0..78651d04f 100644 --- a/sig/mindee/parsing/common/api_response.rbs +++ b/sig/mindee/parsing/common/api_response.rbs @@ -15,11 +15,12 @@ module Mindee class ApiResponse + attr_reader document: Parsing::Common::Document? + attr_reader job: Parsing::Common::Job? + attr_reader api_request: Parsing::Common::ApiRequest + attr_reader raw_http: String + def logger: () -> Logger - def document: -> Parsing::Common::Document? - def job: -> Parsing::Common::Job? - def api_request: -> Parsing::Common::ApiRequest? 
- def raw_http: -> String def initialize: (singleton(Parsing::Common::Inference), Hash[String | Symbol, untyped] | Net::HTTPResponse, String) -> void end end diff --git a/sig/mindee/pdf/pdf_tools.rbs b/sig/mindee/pdf/pdf_tools.rbs index dc8ae8e9d..a456ff4c4 100644 --- a/sig/mindee/pdf/pdf_tools.rbs +++ b/sig/mindee/pdf/pdf_tools.rbs @@ -8,6 +8,8 @@ module Mindee Height: Integer | Float Width: Integer | Float + def self.pdf_header?: (StringIO | File, ?maximum_offset: Integer) -> bool + def to_io_stream: (?Hash[Symbol, untyped]) -> StringIO def intents_as_pdfa1: () -> void def delinearize!: () -> void diff --git a/sig/mindee/v2/parsing/common_response.rbs b/sig/mindee/v2/parsing/common_response.rbs index 890ab345a..cb1d6df42 100644 --- a/sig/mindee/v2/parsing/common_response.rbs +++ b/sig/mindee/v2/parsing/common_response.rbs @@ -3,7 +3,7 @@ module Mindee module V2 module Parsing class CommonResponse - attr_reader raw_http: Hash[String | Symbol, untyped] + attr_reader raw_http: String def initialize: (Hash[String | Symbol, untyped]) -> void end end diff --git a/sig/mindee/v2/parsing/search/pagination_metadata.rbs b/sig/mindee/v2/parsing/search/pagination_metadata.rbs new file mode 100644 index 000000000..5c43bb624 --- /dev/null +++ b/sig/mindee/v2/parsing/search/pagination_metadata.rbs @@ -0,0 +1,20 @@ +# lib/mindee/v2/parsing/search/pagination_metadata.rb + +module Mindee + module V2 + module Parsing + module Search + class PaginationMetadata + attr_reader page: Integer + attr_reader per_page: Integer + attr_reader total_items: Integer + attr_reader total_pages: Integer + + def initialize: (Hash[String|Symbol, untyped]) -> void + + def to_s: -> String + end + end + end + end +end diff --git a/sig/mindee/v2/parsing/search/search_model.rbs b/sig/mindee/v2/parsing/search/search_model.rbs new file mode 100644 index 000000000..e0c02de63 --- /dev/null +++ b/sig/mindee/v2/parsing/search/search_model.rbs @@ -0,0 +1,19 @@ +# lib/mindee/v2/parsing/search/search_model.rb + +module Mindee + module V2 + 
module Parsing + module Search + class SearchModel + attr_reader id: String + attr_reader model_type: String + attr_reader name: String + + def initialize: (Hash[String|Symbol, untyped]) -> void + + def to_s: -> String + end + end + end + end +end diff --git a/sig/mindee/v2/parsing/search/search_response.rbs b/sig/mindee/v2/parsing/search/search_response.rbs new file mode 100644 index 000000000..14c0bfdae --- /dev/null +++ b/sig/mindee/v2/parsing/search/search_response.rbs @@ -0,0 +1,17 @@ +# lib/mindee/v2/parsing/search/search_response.rb +module Mindee + module V2 + module Parsing + module Search + class SearchResponse < CommonResponse + attr_reader models: SearchModels + attr_reader pagination_metadata: PaginationMetadata + + def initialize: (Hash[String|Symbol, untyped]) -> void + + def to_s: -> String + end + end + end + end +end diff --git a/sig/mindee/v2/parsing/search_models.rbs b/sig/mindee/v2/parsing/search_models.rbs new file mode 100644 index 000000000..ba9d391e4 --- /dev/null +++ b/sig/mindee/v2/parsing/search_models.rbs @@ -0,0 +1,14 @@ +# lib/mindee/v2/parsing/search/search_models.rb +module Mindee + module V2 + module Parsing + module Search + class SearchModels < Array[SearchModel] + def initialize: (Array[untyped]) -> void + + def to_s: -> String + end + end + end + end +end diff --git a/sig/mindee/v2/search/search_response.rbs b/sig/mindee/v2/search/search_response.rbs new file mode 100644 index 000000000..e69de29bb diff --git a/spec/bin/cli_integration.rb b/spec/bin/cli_integration.rb new file mode 100644 index 000000000..ede5c4af6 --- /dev/null +++ b/spec/bin/cli_integration.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +require 'open3' +require 'rbconfig' +require_relative '../data' + +describe 'Mindee CLI V2', :integration, :v2, order: :defined do + let(:findoc_model_id) { ENV.fetch('MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID') } + let(:classification_model_id) { ENV.fetch('MINDEE_V2_SE_TESTS_CLASSIFICATION_MODEL_ID') } + let(:crop_model_id) { 
ENV.fetch('MINDEE_V2_SE_TESTS_CROP_MODEL_ID') } + let(:ocr_model_id) { ENV.fetch('MINDEE_V2_SE_TESTS_OCR_MODEL_ID') } + let(:split_model_id) { ENV.fetch('MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID') } + let(:blank_pdf_url) { ENV.fetch('MINDEE_V2_SE_TESTS_BLANK_PDF_URL') } + let(:test_file) { File.join(FILE_TYPES_DIR, 'pdf', 'blank_1.pdf') } + let(:cli_path) { File.expand_path('../../bin/mindee.rb', __dir__) } + + def run_cli(*args) + Open3.capture3(RbConfig.ruby, cli_path, *args) + end + + context 'search-models command' do + ['classification', 'crop', 'extraction', 'ocr', 'split'].each do |model_type| + it "returns model list for type #{model_type}" do + stdout, stderr, status = run_cli('v2', 'search-models', '-t', model_type) + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to be_empty + end + end + + it 'returns no models for non-existent name' do + stdout, stderr, status = run_cli('v2', 'search-models', '-n', 'supercalifragilisticexpialidocious') + expect(status.success?).to eq(true), stderr + expect(stdout.strip).to eq('') + end + + it 'returns models for name filter' do + stdout, stderr, status = run_cli('v2', 'search-models', '-n', 'findoc') + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to be_empty + end + + it 'returns models for name and model_type filters' do + stdout, stderr, status = run_cli('v2', 'search-models', '-n', 'findoc', '-t', 'extraction') + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to be_empty + end + + it 'returns HTTP 422 on invalid model type' do + stdout, stderr, status = run_cli('v2', 'search-models', '-t', 'invalid') + expect(status.success?).to eq(false) + expect("#{stdout}\n#{stderr}").to include('HTTP 422') + end + end + + context 'product commands' do + it 'runs extraction from an URL source' do + stdout, stderr, status = run_cli('v2', 'extraction', '-m', findoc_model_id, blank_pdf_url) + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to 
be_empty + end + + { + 'classification' => -> { classification_model_id }, + 'crop' => -> { crop_model_id }, + 'ocr' => -> { ocr_model_id }, + 'split' => -> { split_model_id }, + }.each do |command, model_id_proc| + it "runs #{command} with default args" do + stdout, stderr, status = run_cli('v2', command, '-m', instance_exec(&model_id_proc), test_file) + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to be_empty + end + end + end + + context 'extraction options' do + [ + ['-a', 'toto'], + ['-r'], + ['-c'], + ['-p'], + ['-t', 'toto'], + ].each do |option_args| + it "runs extraction with #{option_args.join(' ')}" do + stdout, stderr, status = run_cli('v2', 'extraction', '-m', findoc_model_id, test_file, *option_args) + expect(status.success?).to eq(true), stderr + expect(stdout.strip).not_to be_empty + end + end + end +end diff --git a/spec/pdf/pdf_compressor_spec.rb b/spec/pdf/pdf_compressor_spec.rb index 13f6ab588..e9b50adb8 100644 --- a/spec/pdf/pdf_compressor_spec.rb +++ b/spec/pdf/pdf_compressor_spec.rb @@ -20,6 +20,7 @@ it 'should not detect text pdf in an image file.' 
do image_input = Mindee::Input::Source::PathInputSource.new("#{FILE_TYPES_DIR}/receipt.jpg") + expect(Origami::PDF).not_to receive(:read) expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false) end end diff --git a/spec/test_v1_cli.sh b/spec/test_v1_cli.sh new file mode 100755 index 000000000..166d9e7d7 --- /dev/null +++ b/spec/test_v1_cli.sh @@ -0,0 +1,56 @@ +#!/bin/sh +set -e + +# Initialize rbenv +export PATH="$HOME/.rbenv/bin:$PATH" +eval "$($HOME/.rbenv/bin/rbenv init -)" + +TEST_FILE=$1 +RID=$2 + +if [ -z "$TEST_FILE" ]; then + echo "Error: no sample file provided" + exit 1 +fi + +if [ -z "$RID" ]; then + OS_NAME="$(uname -s)" + case "$OS_NAME" in + Linux*) RID="linux-x64" ;; + Darwin*) RID="osx-x64" ;; + CYGWIN*|MINGW*|MSYS*) RID="win-x64" ;; + *) + echo "" + echo "Error: Could not determine default Runtime Identifier (RID) for OS type '$OS_NAME'." + echo "Please provide one manually. Available: 'linux-x64', 'osx-x64', 'win-x64'" + exit 1 + ;; + esac + echo "Warning: Runtime Identifier (RID) not provided, defaulting to $RID" +fi + +WD="$(basename "$PWD")" +if [ "$WD" = "spec" ]; then + CLI_PATH="../bin/mindee.rb" +else + CLI_PATH="./bin/mindee.rb" +fi + +if [ "$RID" = "win-x64" ]; then + CLI_PATH="${CLI_PATH}.exe" +fi + +PRODUCTS="financial-document receipt invoice invoice-splitter" +PRODUCTS_SIZE=4 +i=1 + +for product in $PRODUCTS +do + echo "--- Test $product with Summary Output ($i/$PRODUCTS_SIZE) ---" + SUMMARY_OUTPUT=$(ruby "$CLI_PATH" v1 "$product" "$TEST_FILE") + echo "$SUMMARY_OUTPUT" + echo "" + echo "" + sleep 0.5 + i=$((i + 1)) +done diff --git a/spec/test_code_samples_v1.sh b/spec/test_v1_code_samples.sh similarity index 100% rename from spec/test_code_samples_v1.sh rename to spec/test_v1_code_samples.sh diff --git a/spec/test_v2_cli.sh b/spec/test_v2_cli.sh new file mode 100755 index 000000000..89c4b406e --- /dev/null +++ b/spec/test_v2_cli.sh @@ -0,0 +1,54 @@ +#!/bin/sh +set -e + +TEST_FILE=$1 +RID=$2 + +export 
PATH="$HOME/.rbenv/bin:$PATH" +eval "$($HOME/.rbenv/bin/rbenv init -)" + +if [ -z "$TEST_FILE" ]; then + echo "Error: no sample file provided" + exit 1 +fi + +if [ -z "$RID" ]; then + OS_NAME="$(uname -s)" + case "$OS_NAME" in + Linux*) RID="linux-x64" ;; + Darwin*) RID="osx-x64" ;; + CYGWIN*|MINGW*|MSYS*) RID="win-x64" ;; + *) + echo "" + echo "Error: Could not determine default Runtime Identifier (RID) for OS type '$OS_NAME'." + echo "Please provide one manually. Available: 'linux-x64', 'osx-x64', 'win-x64'" + exit 1 + ;; + esac + echo "Warning: Runtime Identifier (RID) not provided, defaulting to $RID" +fi + +WD="$(basename "$PWD")" +if [ "$WD" = "spec" ]; then + CLI_PATH="../bin/mindee.rb" +else + CLI_PATH="./bin/mindee.rb" +fi + +echo "--- Test model list retrieval (all models)" +MODELS=$("$CLI_PATH" v2 search-models) +if [ -z "$MODELS" ]; then + echo "Error: no models found" + exit 1 +else + echo "Models retrieval OK" +fi + +echo "--- Test extraction with no additional args" +SUMMARY_OUTPUT=$("$CLI_PATH" v2 extraction -m "$MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID" "$TEST_FILE") +if [ -z "$SUMMARY_OUTPUT" ]; then + echo "Error: no extraction output" + exit 1 +else + echo "Extraction retrieval OK" +fi diff --git a/spec/test_code_samples_v2.sh b/spec/test_v2_code_samples.sh similarity index 100% rename from spec/test_code_samples_v2.sh rename to spec/test_v2_code_samples.sh diff --git a/spec/v1/api_response_spec.rb b/spec/v1/api_response_spec.rb index ef744bb60..b59efc682 100644 --- a/spec/v1/api_response_spec.rb +++ b/spec/v1/api_response_spec.rb @@ -8,12 +8,13 @@ context 'An Invoice document' do it 'should be properly created from an ApiResponse' do response = load_json(V1_PRODUCT_DATA_DIR, 'invoices/response_v4/complete.json') + raw_response = JSON.generate(response) rst_response = read_file(V1_PRODUCT_DATA_DIR, 'invoices/response_v4/summary_full.rst') parsed_response = Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::Invoice::InvoiceV4, - response, 
response.to_s) + response, raw_response) expect(parsed_response.document.inference).to be_a Mindee::Product::Invoice::InvoiceV4 expect(parsed_response.document.inference.prediction).to be_a Mindee::Product::Invoice::InvoiceV4Document - expect(parsed_response.raw_http).to eq(response.to_s) + expect(parsed_response.raw_http).to eq(raw_response) expect(parsed_response.document.n_pages).to eq(1) expect(parsed_response.document.inference.pages.length).to eq(1) expect(parsed_response.document.to_s).to eq(rst_response.to_s) diff --git a/spec/v1/async_rseponse_spec.rb b/spec/v1/async_rseponse_spec.rb index 9a264a57f..9a20af42a 100644 --- a/spec/v1/async_rseponse_spec.rb +++ b/spec/v1/async_rseponse_spec.rb @@ -13,14 +13,14 @@ JSON.generate(response)) expect(Mindee::HTTP::ResponseValidation.valid_async_response?(fake_response)).to eq(true) parsed_response = Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, - response, response.to_json) + response, fake_response.body) expect(parsed_response.job.status).to eq(Mindee::Parsing::Common::JobStatus::WAITING) expect(parsed_response.job.id).to eq('76c90710-3a1b-4b91-8a39-31a6543e347c') expect(parsed_response.job.status).to_not respond_to(:available_at) expect(parsed_response.job.status).to_not respond_to(:millisecs_taken) expect(parsed_response.api_request.error).to eq({}) - expect(parsed_response.raw_http).to eq(response.to_json) + expect(parsed_response.raw_http).to eq(fake_response.body) end it 'should not be able to be sent on incompatible endpoints' do @@ -29,7 +29,7 @@ JSON.generate(response)) expect(Mindee::HTTP::ResponseValidation.valid_async_response?(fake_response)).to eq(false) parsed_response = Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, - response, response.to_json) + response, fake_response.body) expect(parsed_response.job).to be(nil) end @@ -39,14 +39,14 @@ JSON.generate(response)) 
expect(Mindee::HTTP::ResponseValidation.valid_async_response?(fake_response)).to eq(true) parsed_response = Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, - response, response.to_json) + response, fake_response.body) expect(parsed_response.job.issued_at.strftime('%Y-%m-%dT%H:%M:%S.%6N')).to eq('2023-03-16T12:33:49.602947') expect(parsed_response.job.status).to eq(Mindee::Parsing::Common::JobStatus::PROCESSING) expect(parsed_response.job.id).to eq('76c90710-3a1b-4b91-8a39-31a6543e347c') expect(parsed_response.job.status).to_not respond_to(:available_at) expect(parsed_response.job.status).to_not respond_to(:millisecs_taken) expect(parsed_response.api_request.error['code']).to eq(nil) - expect(parsed_response.raw_http).to eq(response.to_json) + expect(parsed_response.raw_http).to eq(fake_response.body) end it 'should be able to poll a completed queue' do @@ -55,7 +55,7 @@ JSON.generate(response)) expect(Mindee::HTTP::ResponseValidation.valid_async_response?(fake_response)).to eq(true) parsed_response = Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, - response, response.to_json) + response, fake_response.body) expect(parsed_response.job.issued_at.strftime('%Y-%m-%dT%H:%M:%S.%6N')).to eq('2023-03-21T13:52:56.326107') expect(parsed_response.job.status).to eq(Mindee::Parsing::Common::JobStatus::COMPLETED) expect(parsed_response.job.id).to eq('b6caf9e8-9bcc-4412-bcb7-f5b416678f0d') @@ -63,7 +63,7 @@ expect(parsed_response.job.millisecs_taken).to eq(4664) expect(parsed_response.document).to_not be(nil) expect(parsed_response.api_request.error['code']).to eq(nil) - expect(parsed_response.raw_http).to eq(response.to_json) + expect(parsed_response.raw_http).to eq(fake_response.body) end it 'should retrieve a failed job' do @@ -72,7 +72,7 @@ JSON.generate(response)) expect(Mindee::HTTP::ResponseValidation.valid_async_response?(fake_response)).to eq(false) parsed_response = 
Mindee::Parsing::Common::ApiResponse.new(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, - response, response.to_json) + response, fake_response.body) expect(parsed_response.job.issued_at.strftime('%Y-%m-%dT%H:%M:%S.%6N')).to eq('2024-02-20T10:31:06.878599') expect(parsed_response.job.available_at.strftime('%Y-%m-%dT%H:%M:%S.%6N')).to eq('2024-02-20T10:31:06.878599') expect(parsed_response.api_request.status).to eq(Mindee::Parsing::Common::RequestStatus::SUCCESS) diff --git a/spec/v2/client_v2_spec.rb b/spec/v2/client_v2_spec.rb index 19da06054..46f532239 100644 --- a/spec/v2/client_v2_spec.rb +++ b/spec/v2/client_v2_spec.rb @@ -92,6 +92,7 @@ def stub_next_request_with(method, hash:, status_code: 0) resp = client.get_job('123e4567-e89b-12d3-a456-426614174000') expect(resp).to be_a(Mindee::V2::Parsing::JobResponse) + expect(resp.raw_http).to eq(JSON.generate(JSON.parse(parsed))) expect(resp.job.status).to eq('Processing') expect( resp.job.created_at.strftime('%Y-%m-%dT%H:%M:%S.%6N') @@ -106,6 +107,7 @@ def stub_next_request_with(method, hash:, status_code: 0) resp = client.get_job('123e4567-e89b-12d3-a456-426614174000') expect(resp).to be_a(Mindee::V2::Parsing::JobResponse) + expect(resp.raw_http).to eq(JSON.generate(JSON.parse(parsed))) expect(resp.job.status).to eq('Processed') expect(resp.job.model_id).to eq('87654321-4321-4321-4321-CBA987654321') expect(resp.job.filename).to eq('default_sample.jpg') diff --git a/spec/v2/parser_spec.rb b/spec/v2/parser_spec.rb new file mode 100644 index 000000000..97d2dc812 --- /dev/null +++ b/spec/v2/parser_spec.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +require 'json' +require_relative '../../bin/v2/parser' + +describe MindeeCLI::V2Parser do + subject(:parser) { described_class.new([]) } + + it 'keeps already parsed raw payloads unchanged' do + payload = { 'api_request' => { 'status' => 'success' } } + expect(parser.__send__(:raw_payload, payload)).to eq(payload) + end + + it 'parses JSON string payloads' do + 
payload = { 'api_request' => { 'status' => 'success' } } + expect(parser.__send__(:raw_payload, JSON.generate(payload))).to eq(payload) + end + + it 'parses double-encoded JSON string payloads once more' do + payload = { 'api_request' => { 'status' => 'success' } } + double_encoded = JSON.generate(JSON.generate(payload)) + expect(parser.__send__(:raw_payload, double_encoded)).to eq(payload) + end +end diff --git a/spec/v2/parsing/common_response_spec.rb b/spec/v2/parsing/common_response_spec.rb new file mode 100644 index 000000000..530c4dc23 --- /dev/null +++ b/spec/v2/parsing/common_response_spec.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +require 'json' +require 'mindee' +require 'mindee/v2/parsing/common_response' + +describe Mindee::V2::Parsing::CommonResponse do + it 'stores raw_http as a JSON string' do + server_response = { + 'api_request' => { 'status' => 'success' }, + 'job' => { 'status' => 'Processing' }, + } + + response = described_class.new(server_response) + expect(response.raw_http).to be_a(String) + expect(JSON.parse(response.raw_http)).to eq(server_response) + end +end