Skip to content

Commit 5f4e2ce

Browse files
authored
Merge pull request #104 from kermitt2/feature/cli-converters
Add standalone CLI converters
2 parents 700d1e8 + a2b7e5f commit 5f4e2ce

7 files changed

Lines changed: 3357 additions & 0 deletions

File tree

Readme.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ client.process(
224224
markdown_output=True
225225
)
226226

227+
```python
227228
# Process citation lists
228229
client.process(
229230
service="processCitationList",
@@ -232,6 +233,32 @@ client.process(
232233
)
233234
```
234235

236+
### Standalone Conversion Tools
237+
238+
The library includes standalone scripts to convert TEI XML files to other formats without using the main client or server.
239+
240+
#### TEI to JSON Converter
241+
242+
Converts TEI XML files to the structured JSON format (similar to the `--json` option).
243+
244+
```bash
245+
# Convert a single file
246+
python -m grobid_client.format.TEI2LossyJSON_cli --input path/to/file.tei.xml --output path/to/output.json
247+
248+
# Convert with verbose logging (no --output given, so the JSON is printed to stdout)
249+
python -m grobid_client.format.TEI2LossyJSON_cli --input path/to/file.tei.xml --verbose
250+
```
251+
252+
#### TEI to Markdown Converter
253+
254+
Converts TEI XML files to Markdown format (similar to the `--markdown` option).
255+
256+
```bash
257+
# Convert a single file
258+
python -m grobid_client.format.TEI2Markdown_cli --input path/to/file.tei.xml --output path/to/output.md
259+
```
260+
261+
235262
## ⚙️ Configuration
236263

237264
Configuration can be provided via a JSON file. When using the CLI, the `--server` argument overrides the config file
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Standalone CLI for TEI2LossyJSON converter.
4+
5+
This script provides a command-line interface for converting TEI XML files to JSON format
6+
using the TEI2LossyJSONConverter.
7+
"""
8+
import argparse
9+
import json
10+
import logging
11+
import sys
12+
from pathlib import Path
13+
14+
from .TEI2LossyJSON import TEI2LossyJSONConverter
15+
16+
17+
def setup_logging(verbose: bool = False):
    """Configure the root logger for this command-line tool.

    Args:
        verbose: When true, messages at INFO level and above are emitted;
            otherwise only WARNING and above.
    """
    if verbose:
        threshold = logging.INFO
    else:
        threshold = logging.WARNING

    # Timestamped "<time> - <LEVEL> - <message>" lines.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=threshold,
    )
25+
26+
27+
def convert_single_file(input_file: Path, output_file: Path, verbose: bool = False) -> bool:
    """Convert one TEI XML file and write the result to a JSON file.

    Args:
        input_file: Path of the TEI XML document to convert.
        output_file: Destination path of the JSON document; missing parent
            directories are created.
        verbose: When true, progress messages are logged at INFO level.

    Returns:
        True when the JSON file was written. False when the converter
        returned None or when any exception was raised along the way (the
        failure is logged at ERROR level instead of being propagated).
    """
    try:
        if verbose:
            logging.info(f"Converting {input_file} to {output_file}")

        document = TEI2LossyJSONConverter().convert_tei_file(input_file, stream=False)
        if document is None:
            logging.error(f"Failed to convert {input_file}: TEI file is not well-formed or empty")
            return False

        # Create the destination directory first so the open() below cannot
        # fail on a missing parent.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with output_file.open('w', encoding='utf-8') as handle:
            json.dump(document, handle, indent=2, ensure_ascii=False)

        if verbose:
            logging.info(f"Successfully converted {input_file} to {output_file}")
    except Exception as exc:
        logging.error(f"Error converting {input_file}: {str(exc)}")
        return False

    return True
55+
56+
57+
def main():
    """Run the TEI-to-JSON command-line converter.

    Parses ``--input``/``-i`` (required), ``--output``/``-o`` and
    ``--verbose``/``-v`` from ``sys.argv``, validates the input path and
    converts the file. With ``--output`` the JSON is written to that file;
    without it the JSON is printed to stdout.

    Raises:
        SystemExit: With status 1 when the input path does not exist, is not
            a regular file, or the conversion fails; with status 0 after a
            successful conversion to ``--output``. When printing to stdout
            the function returns normally on success. ``argparse`` itself
            also exits on ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Convert TEI XML files to JSON format using TEI2LossyJSON converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must name the CLI module (``TEI2LossyJSON_cli``), which
        # defines this entry point; ``TEI2LossyJSON`` is the converter library.
        epilog="""
Examples:
  # Convert a single TEI file
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml --output output.json

  # Convert with verbose logging
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml --output output.json --verbose

  # Convert and output to stdout
  python -m grobid_client.format.TEI2LossyJSON_cli --input input.tei.xml
"""
    )

    parser.add_argument(
        "--input", "-i",
        type=Path,
        required=True,
        help="Input TEI XML file to convert"
    )

    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output JSON file (if not specified, prints to stdout)"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate input file
    if not args.input.exists():
        logging.error(f"Input file does not exist: {args.input}")
        sys.exit(1)

    if not args.input.is_file():
        logging.error(f"Input path is not a file: {args.input}")
        sys.exit(1)

    # Convert the file
    if args.output:
        success = convert_single_file(args.input, args.output, args.verbose)
        sys.exit(0 if success else 1)
    else:
        # Output to stdout
        try:
            converter = TEI2LossyJSONConverter()
            result = converter.convert_tei_file(args.input, stream=False)

            if result is None:
                logging.error(f"Failed to convert {args.input}: TEI file is not well-formed or empty")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept it.
                sys.exit(1)

            # Print JSON to stdout
            print(json.dumps(result, indent=2, ensure_ascii=False))

        except Exception as e:
            logging.error(f"Error converting {args.input}: {str(e)}")
            sys.exit(1)
128+
129+
130+
if __name__ == "__main__":
131+
main()
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Standalone CLI for TEI2Markdown converter.
4+
5+
This script provides a command-line interface for converting TEI XML files to Markdown format
6+
using the TEI2MarkdownConverter.
7+
"""
8+
import argparse
9+
import logging
10+
import sys
11+
from pathlib import Path
12+
13+
from .TEI2Markdown import TEI2MarkdownConverter
14+
15+
16+
def setup_logging(verbose: bool = False):
    """Initialise root logging for the Markdown converter CLI.

    Args:
        verbose: Selects the INFO threshold when true and the WARNING
            threshold otherwise.
    """
    settings = {
        'level': logging.WARNING,
        'format': '%(asctime)s - %(levelname)s - %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S',
    }
    if verbose:
        settings['level'] = logging.INFO

    logging.basicConfig(**settings)
24+
25+
26+
def convert_single_file(input_file: Path, output_file: Path, verbose: bool = False) -> bool:
    """Convert one TEI XML file and write the result to a Markdown file.

    Args:
        input_file: Path of the TEI XML document to convert.
        output_file: Destination path of the Markdown document; missing
            parent directories are created.
        verbose: When true, progress messages are logged at INFO level.

    Returns:
        True when the Markdown file was written. False when the converter
        returned None or when any exception was raised along the way (the
        failure is logged at ERROR level instead of being propagated).
    """
    try:
        if verbose:
            logging.info(f"Converting {input_file} to {output_file}")

        markdown = TEI2MarkdownConverter().convert_tei_file(input_file)
        if markdown is None:
            logging.error(f"Failed to convert {input_file}: TEI file is not well-formed or empty")
            return False

        # The destination directory may not exist yet.
        target_dir = output_file.parent
        target_dir.mkdir(parents=True, exist_ok=True)

        with output_file.open('w', encoding='utf-8') as handle:
            handle.write(markdown)

        if verbose:
            logging.info(f"Successfully converted {input_file} to {output_file}")
    except Exception as exc:
        logging.error(f"Error converting {input_file}: {str(exc)}")
        return False

    return True
54+
55+
56+
def main():
    """Run the TEI-to-Markdown command-line converter.

    Parses ``--input``/``-i`` (required), ``--output``/``-o`` and
    ``--verbose``/``-v`` from ``sys.argv``, validates the input path and
    converts the file. With ``--output`` the Markdown is written to that
    file; without it the Markdown is printed to stdout.

    Raises:
        SystemExit: With status 1 when the input path does not exist, is not
            a regular file, or the conversion fails; with status 0 after a
            successful conversion to ``--output``. When printing to stdout
            the function returns normally on success. ``argparse`` itself
            also exits on ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Convert TEI XML files to Markdown format using TEI2Markdown converter",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The examples must name the CLI module (``TEI2Markdown_cli``), which
        # defines this entry point; ``TEI2Markdown`` is the converter library.
        epilog="""
Examples:
  # Convert a single TEI file
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml --output output.md

  # Convert with verbose logging
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml --output output.md --verbose

  # Convert and output to stdout
  python -m grobid_client.format.TEI2Markdown_cli --input input.tei.xml
"""
    )

    parser.add_argument(
        "--input", "-i",
        type=Path,
        required=True,
        help="Input TEI XML file to convert"
    )

    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output Markdown file (if not specified, prints to stdout)"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(args.verbose)

    # Validate input file
    if not args.input.exists():
        logging.error(f"Input file does not exist: {args.input}")
        sys.exit(1)

    if not args.input.is_file():
        logging.error(f"Input path is not a file: {args.input}")
        sys.exit(1)

    # Convert the file
    if args.output:
        success = convert_single_file(args.input, args.output, args.verbose)
        sys.exit(0 if success else 1)
    else:
        # Output to stdout
        try:
            converter = TEI2MarkdownConverter()
            result = converter.convert_tei_file(args.input)

            if result is None:
                logging.error(f"Failed to convert {args.input}: TEI file is not well-formed or empty")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept it.
                sys.exit(1)

            # Print Markdown to stdout
            print(result)

        except Exception as e:
            logging.error(f"Error converting {args.input}: {str(e)}")
            sys.exit(1)
127+
128+
129+
if __name__ == "__main__":
130+
main()

grobid_client/format/__main__.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Package entry point for format converters.
3+
4+
This provides a menu to choose between TEI2LossyJSON and TEI2Markdown converters.
5+
"""
6+
import argparse
7+
import sys
8+
9+
10+
def _print_usage():
    """Print the converter menu and usage examples to stdout."""
    print("GROBID format converters - Choose a converter to run")
    print("\nUsage:")
    print(" python -m grobid_client.format <converter> [options]")
    print("\nAvailable converters:")
    print(" TEI2LossyJSON - Convert TEI XML to JSON format")
    print(" TEI2Markdown - Convert TEI XML to Markdown format")
    print("\nExamples:")
    print(" python -m grobid_client.format TEI2LossyJSON --input file.tei.xml --output output.json")
    print(" python -m grobid_client.format TEI2Markdown --input file.tei.xml --output output.md")
    print("\nGet help for specific converter:")
    print(" python -m grobid_client.format TEI2LossyJSON --help")
    print(" python -m grobid_client.format TEI2Markdown --help")


def main():
    """Dispatch ``python -m grobid_client.format <converter> [options]``.

    The first command-line argument selects the converter
    (``TEI2LossyJSON`` or ``TEI2Markdown``); the remaining arguments are
    handed to that converter's own ``main`` by rewriting ``sys.argv``.

    Raises:
        SystemExit: With status 1 when no converter is given (after printing
            the usage menu) or the converter name is unknown; with status 0
            when the first argument is ``-h`` or ``--help``. The selected
            converter's ``main`` may exit with its own status.
    """
    # Check if a converter was specified
    if len(sys.argv) < 2:
        _print_usage()
        sys.exit(1)

    converter = sys.argv[1]

    # A top-level help request is not a converter name: show the menu and
    # succeed rather than reporting "Unknown converter: --help".
    if converter in ("-h", "--help"):
        _print_usage()
        sys.exit(0)

    if converter == "TEI2LossyJSON":
        # Imported lazily so only the selected converter's module is loaded.
        from .TEI2LossyJSON_cli import main as lossy_main
        # Replace sys.argv to pass remaining args to the converter
        sys.argv = ["TEI2LossyJSON"] + sys.argv[2:]
        lossy_main()
    elif converter == "TEI2Markdown":
        from .TEI2Markdown_cli import main as markdown_main
        # Replace sys.argv to pass remaining args to the converter
        sys.argv = ["TEI2Markdown"] + sys.argv[2:]
        markdown_main()
    else:
        print(f"Unknown converter: {converter}")
        print("Available converters: TEI2LossyJSON, TEI2Markdown")
        sys.exit(1)
45+
46+
47+
if __name__ == "__main__":
48+
main()

0 commit comments

Comments
 (0)