wdoc/setup.py at main · thiswillbeyourgithub/wdoc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import shutil
import subprocess
import sys

from setuptools import find_packages, setup
from setuptools.command.install import install


class PostInstallCommand(install):
    """Install command that runs extra, best-effort setup steps afterwards.

    After the regular ``install`` step, this tries, in order, to:

    1. run ``python -m playwright install``;
    2. upgrade ``yt-dlp`` to its latest pre-release;
    3. upgrade ``python-magic`` from its git repository
       (see https://github.com/ahupp/python-magic/issues/261);
    4. install ``audioop-lts`` on Python 3.13+ (``audioop`` was removed in
       Python 3.13 and pydub needs it, see
       https://github.com/jiaaro/pydub/issues/815);
    5. run ``openparse-download``;
    6. download the nltk ``punkt_tab`` data.

    Every step is optional: a failure is printed and the next step still runs.
    """

    @staticmethod
    def _pip_install_command(*args, user=False):
        """Return the argv list installing ``args`` into the current interpreter.

        Args:
            *args: Extra arguments for the install command (flags, packages).
            user: Request a ``--user`` install. Only honoured for plain pip
                outside of a virtual environment.

        Returns:
            A list of strings suitable for ``subprocess.check_call``.
        """
        uv = shutil.which("uv")
        if uv:
            # Call the uv executable found on PATH directly: "python -m uv"
            # only works when uv is importable by this interpreter, which is
            # not guaranteed just because the binary exists. "--python" makes
            # uv target the interpreter that is running this install.
            # "--user" is never forwarded because uv does not support it.
            return [uv, "pip", "install", "--python", sys.executable, *args]

        command = [sys.executable, "-m", "pip", "install"]
        # pip refuses "--user" when the user site-packages are not visible,
        # which is the usual case inside a virtual environment.
        in_virtualenv = sys.prefix != sys.base_prefix
        if user and not in_virtualenv:
            command.append("--user")
        command.extend(args)
        return command

    @staticmethod
    def _call_best_effort(command, error_template):
        """Run ``command``; on any failure print ``error_template`` and go on.

        Args:
            command: The argv list handed to ``subprocess.check_call``.
            error_template: Message containing an ``{err}`` placeholder that
                is filled with the caught exception.
        """
        try:
            subprocess.check_call(command)
        except Exception as err:
            # Deliberately broad: post-install steps must never abort the
            # installation itself.
            print(error_template.format(err=err))

    def run(self):
        """Run the standard install, then each optional post-install step."""
        install.run(self)

        # do "python -m playwright install"
        self._call_best_effort(
            [sys.executable, "-m", "playwright", "install"],
            "Error when installing playwright: '{err}'",
        )

        # do pip install --user -U --pre yt-dlp
        self._call_best_effort(
            self._pip_install_command("-U", "--pre", "yt-dlp", user=True),
            "Error when installing yt-dlp pre-release: '{err}'",
        )

        # do "python -m pip install -U git+https://github.com/ahupp/python-magic/
        # see https://github.com/ahupp/python-magic/issues/261
        self._call_best_effort(
            self._pip_install_command(
                "-U",
                "git+https://github.com/ahupp/python-magic/",
            ),
            "Error when pip updating python-magic from git: '{err}'",
        )

        # Install audioop-lts only for Python 3.13+
        # audioop was removed in Python 3.13, and pydub needs it
        # See https://github.com/jiaaro/pydub/issues/815
        if sys.version_info >= (3, 13):
            self._call_best_effort(
                self._pip_install_command("-U", "audioop-lts>=0.2.2"),
                "Error when installing audioop-lts for Python 3.13+: '{err}'",
            )

        # do "openparse-download"
        self._call_best_effort(
            ["openparse-download"],
            "Error when trying to run 'openparse-download' to download"
            " weights for deep learning based table detection : '{err}'"
            "\nBy default wdoc still uses pymupdf via openparse so it "
            "shouldn't matter too much.\n"
            "For more: see https://github.com/Filimoa/open-parse/",
        )

        # do "import nltk ; nltk.download('punkt_tab')"
        try:
            # Imported here because nltk is only available once the
            # dependencies have been installed.
            import nltk

            nltk.download("punkt_tab")
        except Exception as err:
            print(f"Error when downloading nltk punkt_tab: '{err}'")


# Build the package's long description from the README.
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding.
with open("README.md", "r", encoding="utf-8") as readme:
    long_description = readme.read()

# Base URL shared by every image referenced in the README.
_README_IMAGES_URL = "https://github.com/thiswillbeyourgithub/wdoc/blob/main/images/"

# (label used in the error message, image file name, alt text) for each
# workflow diagram embedded in the README as an HTML <img> tag.
_README_DIAGRAMS = (
    (
        "query diagram",
        "diagram_query.png",
        "Query task workflow diagram showing the flow from user inputs through Raphael the Rephraser, VectorStore, Eve the Evaluator, Anna the Answerer, and recursive combining to final output",
    ),
    (
        "summary diagram",
        "diagram_summary.png",
        "Summary task workflow diagram showing the flow from user inputs through loading & chunking, Sam the Summarizer, concatenation to wdocSummary output",
    ),
    (
        "search diagram",
        "diagram_search.png",
        "Search task workflow diagram showing the flow from user inputs through Raphael the Rephraser, VectorStore, Eve the Evaluator to search output",
    ),
)

# (label, HTML snippet expected in the README, markdown replacement).
# The icon uses a different HTML wrapper than the diagrams, so it is spelled
# out by hand; the diagram entries are generated from _README_DIAGRAMS.
_README_CONVERSIONS = [
    (
        "the icon",
        f'<p align="center"><img src="{_README_IMAGES_URL}icon.png?raw=true" width="512" style="background-color: transparent !important"></p>',
        f"![icon]({_README_IMAGES_URL}icon.png?raw=true)",
    ),
]
_README_CONVERSIONS.extend(
    (
        _label,
        f'<img src="{_README_IMAGES_URL}{_filename}?raw=true" alt="{_alt}" height="400">',
        f"![{_alt}]({_README_IMAGES_URL}{_filename}?raw=true)",
    )
    for _label, _filename, _alt in _README_DIAGRAMS
)

# Convert each HTML image snippet to markdown image syntax.
for _label, _html, _markdown in _README_CONVERSIONS:
    # Fail loudly if the README changed so that the snippet is no longer
    # found. This is an explicit raise rather than an ``assert`` so the check
    # is not skipped when Python runs with -O.
    if _html not in long_description:
        raise AssertionError(f"Unexpected HTML for {_label}")
    long_description = long_description.replace(_html, _markdown)

# No centered HTML block may remain after the conversions.
if 'align="center"' in long_description:
    raise AssertionError(
        "Unexpected 'align=\"center\"' left in the README after conversion"
    )

# Search keywords published in the package metadata.
_KEYWORDS = [
    "RAG",
    "search",
    "summary",
    "summarize",
    "pdf",
    "documents",
    "doc",
    "docx",
    "youtube",
    "mp3",
    "embeddings",
    "AI",
    "LLM",
    "openai",
    "logseq",
    "doctools",
    "doctoolsllm",
    "winston_doc",
]

# Core runtime dependencies.
_CORE_REQUIREMENTS = [
    "sqlalchemy>=2.0.32",
    "beautifulsoup4>=4.12.3",
    "fire>=0.6.0",
    "ftfy>=6.2.0",
    "joblib>=1.4.2",
    "langchain>=1.2.0",
    "langchain-classic>=1.0.0",
    "langchain-community>=0.3.30",
    "langchain-openai>=0.3.34",
    "langchain-litellm>=0.3.5",
    # observability
    "langfuse>=3.6.1",
    "litellm>=v1.78.2",
    # needed to fix ollama 'event loop closed' error, thanks to
    # https://github.com/BerriAI/litellm/pull/7625/files
    "nest_asyncio>=1.6.0",
    # chonkie is for the semantic embeddings
    "chonkie[all]>=1.4.0",
    "chonkie[semantic]>=1.4.0",
    "prompt-toolkit>=3.0.47",
    "tqdm>=4.66.4",
    "faiss-cpu>=1.8.0",
    "rich>=13.8.1",
    "beartype >= 0.22.2",
    "platformdirs >= 4.2.2",
    "dill >= 0.3.8",
    # banner
    "pyfiglet >= 1.0.2",
    "rtoml >= 0.11.0",
    "loguru >= 0.7.2",
    # to print ascii graph
    "grandalf >= 0.8",
    "lazy-import >= 0.2.2",
    # DIY wrapper to tell anki to sync just in case
    "py_ankiconnect >= 1.1.2",
    # both used for semantic reordering
    "scikit-learn >= 1.5.1",
    "scipy >= 1.13.1",
    # NOTE: python-magic (>= 0.4.27, for detecting file type) is deliberately
    # not listed: it was made optional as it can help infer the filetype, and
    # 0.4.28 is necessary for the pipe feature.
    # time sortable timestamp
    "uuid6 >= 2025.0.1",
    # by me, like a dict but an LMDB database, to fix langchain's caches
    "PersistDict >= 0.2.14",
    # needed for the punkt_tab download in post-install
    "nltk>=3.9.2",
    # faster than sha256
    "blake3>=1.0.8",
    "pandas >= 2.3.3",
]

# Loaders that are included by default.
_LOADER_REQUIREMENTS = [
    # online_media and urls
    "playwright >= 1.45.0",
    # pdf with table support
    "openparse[ml] >= 0.5.7",
    # youtube: yt-dlp has to be listed here otherwise readthedocs crashes.
    # The post-install step additionally tries to upgrade it to a pre-release.
    "yt-dlp >= 2025.09.26",
    "youtube-transcript-api >= 0.6.2",
    # url
    "tldextract>=5.1.2",
    "goose3 >= 3.1.20",
    # online search via 'filetype=web'
    "ddgs >= 9.6.0",
    "duckduckgo-search >= 8.1.1",
    # audio/video transcription
    "deepgram-sdk >= 3.2.7",
    # to increase deepgram timeout
    "httpx >= 0.27.0",
    # both used for extracting audio from local video
    "pydub >= 0.25.1",
    "ffmpeg-python >= 0.2.0",
    # silence removal from audio
    "torchaudio >= 2.8.0",
    # for some reason older versions of trio, when present, are used and cause
    # issues on python 3.11: https://github.com/python-trio/trio/issues/2317
    "trio >= 0.31.0",
    # many file formats
    "unstructured[all-docs]>=0.18.15",
]

# Optional dependency groups, selectable as e.g. ``wdoc[full]``.
_EXTRAS_REQUIRE = {
    # Additional loaders
    "full": [
        # pdf
        "pdfminer.six >= 20231228",
        "pillow_heif >= 0.16.0",
        "pypdfium2 >= 4.30.0",
        "pymupdf >= 1.24.5",
        "pdfplumber >= 0.11.1",
        "pdf2image >= 1.17.0",
        # word documents
        "docx2txt >= 0.8",
        # epub
        "pandoc >= 2.4",
        # anki
        "ankipandas>=0.3.15",
        # logseq files (I'm the dev behind it)
        "LogseqMarkdownParser >= 3.3",
    ],
    # buggy in windows so optional:
    # https://github.com/zafercavdar/fasttext-langdetect/issues/14
    "fasttext": [
        "fasttext-langdetect >= 1.0.5",
        "langdetect >= 1.0.9",
    ],
    # requires system packages:
    # sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
    "pdftotext": [
        "pdftotext >= 2.2.2",
    ],
    # development tooling
    "dev": [
        "ruff >= 0.14.1",
        "pre-commit >= 4.1.0",
        "pytest >= 8.3.4",
        "pytest-xdist >= 3.6.1",
        "build >= 1.2.2.post1",
        "twine >= 6.1.0",
        "bumpver >= 2025.1131",
    ],
}

# Package declaration: metadata first, then contents, then dependencies.
setup(
    name="wdoc",
    version="5.0.0",
    description="A perfect AI powered RAG for document query and summary. Supports ~all LLM and ~all filetypes (url, pdf, epub, youtube (incl playlist), audio, anki, md, docx, pptx, or any combination!)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/thiswillbeyourgithub/wdoc/",
    license="AGPLv3",
    keywords=_KEYWORDS,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
    packages=find_packages(),
    include_package_data=True,
    entry_points={
        "console_scripts": [
            "wdoc=wdoc.__main__:cli_launcher",
        ],
    },
    python_requires=">=3.11",
    install_requires=_CORE_REQUIREMENTS + _LOADER_REQUIREMENTS,
    extras_require=_EXTRAS_REQUIRE,
    # Swap in the custom install command so the post-install steps run.
    cmdclass={
        "install": PostInstallCommand,
    },
)