-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPDFexportRandomPage.sh
More file actions
executable file
·212 lines (172 loc) · 7.62 KB
/
PDFexportRandomPage.sh
File metadata and controls
executable file
·212 lines (172 loc) · 7.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/bin/zsh
# This script selects random PDF files from a folder and all subfolders, and
# extracts image(s) of random pages from the selected PDFs. Appends all image file paths
# to a text CSV log file.
#
# Tested on OSX High Sierra 10.13.6
# Dan Bowen steamfire@gmail.com
# MIT License
#
# INPUT: path to a directory; it and all of its subfolders are searched for PDFs
# OUTPUT: image files, CSV text log of all runs of the command. (previous runs will be left
# in the file.)
#
# Requires:
# zshell (zsh)
# Imagemagick (for mogrify)
# poppler (for pdftocairo)
# coreutils (for realpath and shuf)
# mdls (for PDF pagecount, comes with OSX. Uses the spotlight index so that it doesn't have to process the PDFs when this command is used. Lightweight on CPU!)
#TO DO
# Output all PDF image filenames to Stdout
#"FIGURE OUT HOW TO PASS ESCAPED FILE NAMES AS THE OUTPUT DIRECTORY PATH! NOT FINISHED"
# Zero pad the page numbers in image filenames
# deal with user inputting output directory path without the trailing slash
##### NEXT LINE IS FOR DEBUGGING ONLY, DO NOT LEAVE ENABLED ####
#set -- --pdfs 1 --pages 1 --verbose /Users/admin/Dropbox/BalloonConsulting/PDF\ Workflow\ Redevelopment\ Scratch\ Folder\ 2020/Test\ Spell\ checking/2020\ PDF\ samples/
#Help text shown when the script is called without an input directory.
#Optional arguments are written in [square brackets] in the synopsis line.
USAGE="
Program to find random PDF files and export random pages from the PDFs as image files.
Usage: PDFexportRandomPage.sh --pdfs # --pages # [--img png|jpg|tiff|pdf ] [--logdir path] [--verbose] [--dryrun] InputDirectory [OutputDirectory]
This requires three arguments. --pdfs, --pages, and InputDirectory
--pdfs is the quantity of PDFs that you would like it to pull randomly from the input
directory and subfolders.
--pages is the quantity of pages to pull from each PDF
InputDirectory is the path to the directory containing PDFs.
--img is the format to make the output images. Common image file suffixes work here.
--logdir is the directory to output the processing log file
--verbose prints detailed information to STDOUT console as it progresses through
the process.
--dryrun runs the parts that pick the files and pages, and shows the image filename that would have been created.
OutputDirectory is the destination directory to place the image files.
Will default to output images and log file to current directory.
Not yet sure if it works with spaces or special chars.
This will crawl through all subdirectories to look for PDFs.
"
# Verbose-only messages are printed with `$echoLog "message"`: the variable holds the NAME
# of the command that does the printing. It starts out as the do-nothing silentEcho below,
# and the --verbose handling further down replaces it with `echo`.
typeset echoLog=silentEcho
# Stand-in logger used while verbose mode is off: ignores its arguments, prints nothing
# and reports success.
silentEcho() { true; }
#Load the utility that has zparseopts in it, to parse the input arguments to the script
zmodload zsh/zutil
zparseopts -D -E -A opts -pdfs: -pages: -img: -verbose -dryrun -imgdir: -logdir:
#if no path was provided then print the usage information
if [ $# -lt 1 ] ; then
echo $USAGE
exit 1;
fi
#Check for the verbose flag in the list of arguments
if [[ -n ${opts[(ie)--verbose]} ]]; then
echoLog='echo'
fi
#Check for the dryrun flag in the list of arguments
if [[ -n ${opts[(ie)--dryrun]} ]]; then
dryRun=true
$echoLog -e "DRY RUN"
else
dryRun=false
fi
#Check for the image format option in the list of arguments
if [[ -n ${opts[(ie)--img]} ]]; then
OUTPUTIMAGEEXTENSION=$opts[--img]
else
OUTPUTIMAGEEXTENSION="png"
fi
#Set the output directory to current directory if none provided
if [ $# -ne 2 ] ; then
directoryToOutput="./"
else
directoryToOutput=$2
fi
OUTPUTIMAGEWIDTH=1440
totalFiles=0
currentFileStatus=0
PAGECOUNT=0
numberOfPDFs=$opts[--pdfs]
numberOfPagesPerPDF=$opts[--pages]
directoryToCrawl=$1
directoryForLogFile=$opts[--logdir]
dateNow=`/bin/date +"%Y-%m-%d"`
totalImagesToCreate=$numberOfPDFs*numberOfPagesPerPDF
#Check if the input directory exists, exit with an error status if not
if [[ -d "$directoryToCrawl" ]]; then
  $echoLog -e "Input Directory: $directoryToCrawl\n"
else
  print -r -- "Directory Not Found: $directoryToCrawl." >&2
  exit 1
fi
$echoLog -e "VERBOSE MODE ON
PDF Export Random Page script
Dan Bowen 2021
MIT License\n"
$echoLog -e "Format: $OUTPUTIMAGEEXTENSION"
$echoLog -e "\nWill pull: $numberOfPagesPerPDF pages each from: $numberOfPDFs PDF files\n"
#Find every PDF once; the same list is used for counting and for the random selection
allPDFsFound=$(find "$directoryToCrawl" -iname '*.pdf' -print)
#Split the result into one array element per line. Empty elements are dropped, so an empty
#result counts as 0 and a single path counts as 1 (counting newlines with wc -l would
#report 0 for a single path, because command substitution strips the trailing newline).
pdfPathList=(${(f)allPDFsFound})
qtyPDFsFound=${#pdfPathList}
#Exit if no files were found. Errors go to stderr so they are visible without --verbose.
if (( qtyPDFsFound < 1 )); then
  print -r -- "ERROR: No PDFs found." >&2
  exit 1
fi
$echoLog -e "Total PDFs found: $qtyPDFsFound"
#$echoLog -e "All pdfs list: $allPDFsFound"
#Pick the requested number of PDFs at random, one path per line
SELECTEDPDFS=$(print -rl -- "${pdfPathList[@]}" | shuf -n "$numberOfPDFs")
#Capture the status immediately: any later command (including a test) overwrites $?
selectionStatus=$?
if (( selectionStatus > 0 )); then
  print -r -- "***** Error $selectionStatus while selecting random PDFs with shuf *****" >&2
  exit 1
fi
$echoLog "$SELECTEDPDFS" | while read thisLine
#find "$directoryToCrawl" -iname '*.pdf' -print | sort | while read thisLine
do
targetFilePath=$thisLine
targetFileName=$targetFilePath:t
targetFileAbsolutePath=`realpath "$targetFilePath"`
$echoLog -e "PDF Selected: $targetFileName "
##Get the text printed out of the info of the PDF file
##currentFileInfo=`/usr/local/bin/pdfinfo "$targetFilePath"`
#Get page count of the current file
PAGECOUNT=`/usr/bin/mdls -raw -name kMDItemNumberOfPages "$targetFilePath"`
$echoLog -n "Total Pages in PDF: $PAGECOUNT"
#if mdls exited with an error code (greater than 1)
if [ ${?} -gt 0 ] ; then
$echoLog -e "***** MDLS Error ${?} from $targetFilePath *****\n\n"
fi
#****LOOP to process the number of pages in each PDF****
$echoLog -e ""
for ((i = 0; i < $numberOfPagesPerPDF; i++))
do
#Generate a random page number
RANDOMPAGENUMBER=`jot -r 1 1 $PAGECOUNT`
$echoLog -n " Pulling page #"
$echoLog -n "$RANDOMPAGENUMBER, "
OUTPUTIMAGEFILENAME="$targetFileName--page$RANDOMPAGENUMBER.$OUTPUTIMAGEEXTENSION"
OUTPUTIMAGEPATH="$directoryToOutput$OUTPUTIMAGEFILENAME"
$echoLog -e " Output Path: $OUTPUTIMAGEPATH"
OUTPUTHEADERTEXT="$targetFileName"
OUTPUTHEADER2TEXT="Page $RANDOMPAGENUMBER"
OUTPUTFOOTERTEXT="Extracted on $dateNow"
#check for dry run, if so, don't do the actual work
if [ "$dryRun" = false ] ; then
#Generate a jpg of the selected page without the page number appended
#pdftocairo -jpeg -jpegopt optimize=y -f $RANDOMPAGENUMBER -l $RANDOMPAGENUMBER -singlefile "$targetFilePath
#Generate a jpg of the selected page and output to STDOUT, processing with mogrify to add formatted text:
pdftocairo -png -f $RANDOMPAGENUMBER -l $RANDOMPAGENUMBER -scale-to-x $OUTPUTIMAGEWIDTH -scale-to-y -1 -singlefile "$targetFilePath" - \
| mogrify -font helvetica -fill orange -pointsize 36 -gravity north -write "$OUTPUTIMAGEFILENAME" -draw "text 0,10 '$OUTPUTHEADERTEXT'" \
-draw "text 0,50 '$OUTPUTHEADER2TEXT'"\
-pointsize 24 -gravity south -draw "text 0,10 '$OUTPUTFOOTERTEXT'" -quality 70 -
#Save the date, page number, and the path to the PDF that the page was extracted from.
$echoLog -e "$dateNow, Page:, $RANDOMPAGENUMBER, $targetFileAbsolutePath" >> pdfImageExtractionLog.txt
fi
done
$echoLog -e
(( totalFiles++ ))
done
#Closing summary (verbose mode only): how many PDFs were handled, worded according to
#whether this was a real run or a dry run.
case "$dryRun" in
  false)
    $echoLog -e "\nTotal Files Processed: $totalFiles"
    ;;
  *)
    $echoLog -e "\nDry Run, Total Files that WOULD HAVE BEEN Processed: $totalFiles"
    ;;
esac