diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 5eeeb28..4d7b7cb 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -6,4 +6,4 @@ on:
 jobs:
   deploy:
     runs-on: ubuntu-latest
-    steps: [uses: fastai/workflows/quarto-ghp@master]
+    steps: [uses: fastai/workflows/quarto-ghp3@master]
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index f0aca1e..0f94764 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -4,4 +4,4 @@ on: [workflow_dispatch, pull_request, push]
 jobs:
   test:
     runs-on: ubuntu-latest
-    steps: [uses: fastai/workflows/nbdev-ci@master]
\ No newline at end of file
+    steps: [uses: fastai/workflows/nbdev3-ci@master]
\ No newline at end of file
diff --git a/clean_plot/__init__.py b/clean_plot/__init__.py
index 6b06b00..ccaf968 100644
--- a/clean_plot/__init__.py
+++ b/clean_plot/__init__.py
@@ -1 +1,5 @@
-__version__ = "0.0.14"
\ No newline at end of file
+__version__ = "0.0.14"
+
+from .utils import download_nltk_dep
+
+download_nltk_dep()
\ No newline at end of file
diff --git a/clean_plot/_modidx.py b/clean_plot/_modidx.py
index b27e0c6..645a697 100644
--- a/clean_plot/_modidx.py
+++ b/clean_plot/_modidx.py
@@ -1,9 +1,9 @@
 # Autogenerated by nbdev
 
 d = { 'settings': { 'branch': 'master',
-                'doc_baseurl': '/clean_plot/',
+                'doc_baseurl': '/clean_plot',
                 'doc_host': 'https://deven367.github.io',
-                'git_url': 'https://github.com/deven367/clean_plot/',
+                'git_url': 'https://github.com/deven367/clean_plot',
                 'lib_path': 'clean_plot'},
  'syms': { 'clean_plot.errors': { 'clean_plot.errors.MyException': ('errors.html#myexception', 'clean_plot/errors.py'),
                                   'clean_plot.errors.MyException.__init__': ('errors.html#myexception.__init__', 'clean_plot/errors.py'),
diff --git a/clean_plot/errors.py b/clean_plot/errors.py
index 43f8944..613f16a 100644
--- a/clean_plot/errors.py
+++ b/clean_plot/errors.py
@@ -1,9 +1,11 @@
+"""Implementations of custom errors"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/07_errors.ipynb.
 
-# %% auto 0
+# %% auto #0
 __all__ = ['MyException']
 
-# %% ../nbs/07_errors.ipynb 3
+# %% ../nbs/07_errors.ipynb #e47f666a
 class MyException(Exception):
     def __init__(self, message):
         super().__init__(message)
@@ -14,5 +16,5 @@ def __str__(self):
         return self.message
 
 
-# %% ../nbs/07_errors.ipynb 4
+# %% ../nbs/07_errors.ipynb #bd0b5135
 import inspect
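Note: with the `__init__.py` change above, `import clean_plot` now runs `download_nltk_dep()` on every import, which can hit the network even when the corpora are already present. If that ever needs softening, a guarded variant is possible — a minimal sketch, not part of this diff; the `ensure_nltk_dep` name is hypothetical and the resource paths should be checked against what `download_nltk_dep()` actually fetches:

```python
import nltk

def ensure_nltk_dep():
    # Assumed resource list; verify against download_nltk_dep() in clean_plot/utils.py.
    resources = {
        "tokenizers/punkt": "punkt",
        "corpora/stopwords": "stopwords",
        "corpora/wordnet": "wordnet",
        "taggers/averaged_perceptron_tagger": "averaged_perceptron_tagger",
        "corpora/omw-1.4": "omw-1.4",
    }
    for path, name in resources.items():
        try:
            nltk.data.find(path)              # cheap local lookup, no network
        except LookupError:
            nltk.download(name, quiet=True)   # fetch only what is missing
```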
diff --git a/clean_plot/heatmaps_novels.py b/clean_plot/heatmaps_novels.py
index 94a8f5a..8eb4e03 100644
--- a/clean_plot/heatmaps_novels.py
+++ b/clean_plot/heatmaps_novels.py
@@ -1,6 +1,8 @@
+"""This module is to create heatmaps for given books. It even includes some functions to generate smaller heatmaps"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_heatmaps_novels.ipynb.
 
-# %% ../nbs/03_heatmaps_novels.ipynb 3
+# %% ../nbs/03_heatmaps_novels.ipynb #496cc07d
 from __future__ import annotations
 from .utils import *
 from .utils import check_files
@@ -16,11 +18,11 @@ import pickle
 from scipy.stats import zscore
 
 
-# %% auto 0
+# %% auto #0
 __all__ = ['heatmap_from_pkl', 'plot_novels', 'plot_histograms', 'ssms_from_pkl', 'corr_heatmaps', 'corr_ts', 'lex_ts',
            'plot_standardized']
 
-# %% ../nbs/03_heatmaps_novels.ipynb 6
+# %% ../nbs/03_heatmaps_novels.ipynb #adfe6474
 @call_parse
 def heatmap_from_pkl(
     path: str = ".",  # path to pkl files
@@ -121,7 +123,7 @@ def heatmap_from_pkl(
     print("-" * 45)
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 9
+# %% ../nbs/03_heatmaps_novels.ipynb #ea177d41
 @call_parse
 def plot_novels(
     path: str = None,  # path for embeddings
@@ -263,10 +265,10 @@ def plot_novels(
     del em, sim, n
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 11
+# %% ../nbs/03_heatmaps_novels.ipynb #d4662490
 from scipy.stats import zscore
 
-# %% ../nbs/03_heatmaps_novels.ipynb 12
+# %% ../nbs/03_heatmaps_novels.ipynb #12edf4e8
 @call_parse
 def plot_histograms(
     path: str,  # path for embeddings
@@ -343,10 +345,10 @@ def plot_histograms(
     print(f"Done plotting {title}.png")
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 13
+# %% ../nbs/03_heatmaps_novels.ipynb #b80c8a10
 import pandas as pd
 
-# %% ../nbs/03_heatmaps_novels.ipynb 14
+# %% ../nbs/03_heatmaps_novels.ipynb #57b04ae7
 @call_parse
 def ssms_from_pkl(
     path: str,  # path for pkl file
@@ -393,7 +395,7 @@ def ssms_from_pkl(
     plt.clf()
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 15
+# %% ../nbs/03_heatmaps_novels.ipynb #1d8809d3
 @call_parse
 def corr_heatmaps(
     path: str,  # path for embeddings
@@ -480,7 +482,7 @@ def corr_heatmaps(
     # plt.clf()
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 16
+# %% ../nbs/03_heatmaps_novels.ipynb #b9f81ea5
 @call_parse
 def corr_ts(
     path: str,  # path for embeddings
@@ -499,7 +501,7 @@ def corr_ts(
     _plot(embedding_path, data, name)
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 17
+# %% ../nbs/03_heatmaps_novels.ipynb #f082e00b
 @call_parse
 def lex_ts(
     path: str,  # path for embeddings
@@ -523,7 +525,7 @@ def lex_ts(
     print(len(z))
 
 
-# %% ../nbs/03_heatmaps_novels.ipynb 18
+# %% ../nbs/03_heatmaps_novels.ipynb #8f9a5f21
 @call_parse
 def plot_standardized(
     path: str,  # path for embeddings
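For review context: the renamed `# %%` markers in this module only switch nbdev's cell links from positional indices to id-based anchors (they match the `"id"` fields added to the notebooks later in this diff); the plotting logic itself is untouched. Conceptually, each of these commands loads saved sentence embeddings, builds a self-similarity matrix (SSM), and renders it. A rough sketch of that pipeline — the filename is hypothetical and the plotting defaults are not taken from this diff:

```python
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

embeddings = np.load("dracula_cleaned_bert.npy")  # (n_sentences, dim), hypothetical file

# Row-normalise, then the Gram matrix is pairwise cosine similarity.
normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
ssm = normed @ normed.T

sns.heatmap(ssm, cmap="hot", vmin=-1, vmax=1)
plt.title("Dracula BERT")
plt.savefig("dracula_bert_ssm.png", dpi=300)
```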
diff --git a/clean_plot/lexical.py b/clean_plot/lexical.py
index 3832618..fd0982e 100644
--- a/clean_plot/lexical.py
+++ b/clean_plot/lexical.py
@@ -1,9 +1,11 @@
+"""This file goes over generating lexical embeddings"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_lexical.ipynb.
 
-# %% auto 0
+# %% auto #0
 __all__ = ['interpolate', 'load_pmi', 'load_dictionary', 'write_to_file_lexical', 'process_v2']
 
-# %% ../nbs/02_lexical.ipynb 3
+# %% ../nbs/02_lexical.ipynb #a99648f1
 import re
 from . import *
 import os
@@ -11,7 +13,7 @@
 from collections import OrderedDict
 from fastcore.all import *
 
-# %% ../nbs/02_lexical.ipynb 6
+# %% ../nbs/02_lexical.ipynb #92362780
 def interpolate(lex, removed_indices=[]):
     """
     Method does interpolation based on the removed indices.
@@ -23,20 +25,20 @@ def interpolate(lex, removed_indices=[]):
     return lex
 
 
-# %% ../nbs/02_lexical.ipynb 7
+# %% ../nbs/02_lexical.ipynb #7b6b4ad9
 def load_pmi(path):
     pmi = np.load(path)
     return pmi
 
 
-# %% ../nbs/02_lexical.ipynb 8
+# %% ../nbs/02_lexical.ipynb #9ef07dfc
 def load_dictionary(path):
     fname = open(path, "rb")
     data = pickle.load(fname)
     return data
 
 
-# %% ../nbs/02_lexical.ipynb 9
+# %% ../nbs/02_lexical.ipynb #3ece596a
 def write_to_file_lexical(sentences, fname):
     with open(fname[:-4] + "_lexical.txt", "w") as f:
         for line in sentences:
@@ -44,7 +46,7 @@ def write_to_file_lexical(sentences, fname):
     f.close()
 
 
-# %% ../nbs/02_lexical.ipynb 10
+# %% ../nbs/02_lexical.ipynb #37e49d2d
 def process_v2(fname):
     all_data = get_data(fname)
     all_data = unidecode.unidecode(all_data)
diff --git a/clean_plot/pickle.py b/clean_plot/pickle.py
index 645f6f5..9652847 100644
--- a/clean_plot/pickle.py
+++ b/clean_plot/pickle.py
@@ -1,6 +1,8 @@
+"""This module contains the functions to export all the embeddings to a time series format, group them together and export it as a pickle file"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_pickle.ipynb.
 
-# %% ../nbs/01_pickle.ipynb 2
+# %% ../nbs/01_pickle.ipynb #cc0ae037
 from __future__ import annotations
 import os
 import numpy as np
@@ -11,11 +13,11 @@
 from fastcore.xtras import *
 from fastcore.script import *
 
-# %% auto 0
+# %% auto #0
 __all__ = ['label', 'cos_sim', 'successive_similarities', 'create_dict_whole_book', 'create_label_whole_book',
            'create_label', 'get_embed_method_and_name']
 
-# %% ../nbs/01_pickle.ipynb 4
+# %% ../nbs/01_pickle.ipynb #01c70e33
 def label(method: str):  # name of the method
     """
     Returns the full name of the model based on the abbreviation
@@ -41,7 +43,7 @@ def label(method: str):  # name of the method
     return switcher.get(method)
 
 
-# %% ../nbs/01_pickle.ipynb 5
+# %% ../nbs/01_pickle.ipynb #091b0d73
 def cos_sim(
     a: np.ndarray,  # vector 1
     b: np.ndarray,  # vector 2
@@ -52,10 +54,10 @@ def cos_sim(
     return dot(a, b) / (norm(a) * norm(b))
 
 
-# %% ../nbs/01_pickle.ipynb 6
+# %% ../nbs/01_pickle.ipynb #0463a325
 from pathlib import Path
 
-# %% ../nbs/01_pickle.ipynb 7
+# %% ../nbs/01_pickle.ipynb #aad409d5
 def successive_similarities(embeddings, k):
     successive = []
     for i in range(len(embeddings) - k):
@@ -63,7 +65,7 @@ def successive_similarities(embeddings, k):
     return successive
 
 
-# %% ../nbs/01_pickle.ipynb 8
+# %% ../nbs/01_pickle.ipynb #9b793316
 @call_parse
 def create_dict_whole_book(
     embedding_path: str = ".",  # path to the embeddings
@@ -128,7 +130,7 @@ def create_dict_whole_book(
     print(f"Saved pkl at {new_path}")
 
 
-# %% ../nbs/01_pickle.ipynb 9
+# %% ../nbs/01_pickle.ipynb #4a217262
 def create_label_whole_book(method, parent_dir):
     # returns only the method name
     return label(method)
@@ -137,13 +139,13 @@
     # return parent_dir.title() + ' ' + label(method)
 
 
-# %% ../nbs/01_pickle.ipynb 10
+# %% ../nbs/01_pickle.ipynb #c35ac5e6
 def create_label(index, method, parent_dir):
     met = label(method)
     return "Book " + str(index + 1) + " " + parent_dir.title() + " " + met
 
 
-# %% ../nbs/01_pickle.ipynb 11
+# %% ../nbs/01_pickle.ipynb #bad8ed11
 def get_embed_method_and_name(
     fname,  # name of the file
 ) -> (str, str):  # name of file, embeddding method
diff --git a/clean_plot/plot/utils.py b/clean_plot/plot/utils.py
index c55f0cb..a6d101a 100644
--- a/clean_plot/plot/utils.py
+++ b/clean_plot/plot/utils.py
@@ -1,6 +1,8 @@
+"""This module tries to include most of the plotting functionality available in the package"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/04_plot.utils.ipynb.
 
-# %% ../../nbs/04_plot.utils.ipynb 3
+# %% ../../nbs/04_plot.utils.ipynb #b2cde59e
 from __future__ import annotations
 from fastcore.basics import store_attr, patch_to, patch
 from fastcore.xtras import globtastic
@@ -16,16 +18,16 @@
 import matplotlib.pyplot as plt
 import gc
 
 
-# %% auto 0
+# %% auto #0
 __all__ = ['Plot']
 
-# %% ../../nbs/04_plot.utils.ipynb 4
+# %% ../../nbs/04_plot.utils.ipynb #42232bac
 sns.set_style(style="white")
 
-# %% ../../nbs/04_plot.utils.ipynb 5
+# %% ../../nbs/04_plot.utils.ipynb #d40153c2
 import inspect
 
-# %% ../../nbs/04_plot.utils.ipynb 6
+# %% ../../nbs/04_plot.utils.ipynb #0c04e9eb
 class Plot:
     "Plotting module"
@@ -119,7 +121,7 @@ def __repr__(self):
         return f"This object contains the path to `{self.path.absolute()}`"
 
 
-# %% ../../nbs/04_plot.utils.ipynb 7
+# %% ../../nbs/04_plot.utils.ipynb #0f9376f0
 @patch
 def get_normalized(self: Plot):
     "Returns the normalized ssms"
@@ -150,7 +152,7 @@ def get_normalized(self: Plot):
     return self.norm
 
 
-# %% ../../nbs/04_plot.utils.ipynb 8
+# %% ../../nbs/04_plot.utils.ipynb #6ca425a9
 @patch
 def get_standardized(self: Plot):
     "Returns the standardized ssms"
diff --git a/clean_plot/utils.py b/clean_plot/utils.py
index 0d7fd08..e072de1 100644
--- a/clean_plot/utils.py
+++ b/clean_plot/utils.py
@@ -1,6 +1,8 @@
+"""Various utils for cleaning, organizing and capturing other information."""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_utils.ipynb.
 
-# %% ../nbs/00_utils.ipynb 4
+# %% ../nbs/00_utils.ipynb #dcda87a7
 from __future__ import annotations
 import pickle
 import numpy as np
@@ -12,12 +14,12 @@
 from fastcore.test import test_eq
 from fastcore.script import call_parse
 
-# %% auto 0
+# %% auto #0
 __all__ = ['loader', 'get_data', 'load_pmi', 'load_dictionary', 'normalize', 'chelp', 'download_nltk_dep',
            'split_by_newline', 'rm_useless_spaces', 'make_sentences', 'write_to_file_cleaned', 'clean', 'get_wordnet_pos',
            'remove_stopwords', 'remove_punctuations', 'remove_punc_clean', 'process_for_lexical', 'num_words']
 
-# %% ../nbs/00_utils.ipynb 6
+# %% ../nbs/00_utils.ipynb #e7cc5669
 def check_files(
     files,  # files to validate
 ):
@@ -37,7 +39,7 @@ def check_files(
     return True
 
 
-# %% ../nbs/00_utils.ipynb 9
+# %% ../nbs/00_utils.ipynb #48fba8b9
 @delegates(globtastic)
 def loader(
     path: str | Path,  # path to a given folder,
@@ -50,7 +52,7 @@ def loader(
     return files
 
 
-# %% ../nbs/00_utils.ipynb 10
+# %% ../nbs/00_utils.ipynb #bbd7b4c2
 def get_data(
     fname: str | Path,  # path to the file
 ) -> str:  # returns content of the file
@@ -60,7 +62,7 @@
     return all_text
 
 
-# %% ../nbs/00_utils.ipynb 11
+# %% ../nbs/00_utils.ipynb #73089b39
 def load_pmi(fname: str | Path) -> np.ndarray:  # name of pmi file # pmi matrix
     """
     Loads the PMI matrix
@@ -71,7 +73,7 @@ def load_pmi(fname: str | Path) -> np.ndarray:  # name of pmi file # pmi matrix
     return pmi
 
 
-# %% ../nbs/00_utils.ipynb 14
+# %% ../nbs/00_utils.ipynb #16658b0c
 def load_dictionary(
     fname: str,  # path to the pkl file
 ) -> dict:  # returns the contents
@@ -84,7 +86,7 @@ def load_dictionary(
     return data
 
 
-# %% ../nbs/00_utils.ipynb 15
+# %% ../nbs/00_utils.ipynb #79b028e3
 def normalize(
     data: np.ndarray,  # input array
 ) -> np.ndarray:  # normalized array
@@ -94,7 +96,7 @@
     return (data - np.min(data)) / (np.max(data) - np.min(data))
 
 
-# %% ../nbs/00_utils.ipynb 17
+# %% ../nbs/00_utils.ipynb #0a3e5d59
 @call_parse
 def chelp():
     "Show help for all console scripts"
@@ -103,11 +105,11 @@ def chelp():
     console_help("clean_plot")
 
 
-# %% ../nbs/00_utils.ipynb 20
+# %% ../nbs/00_utils.ipynb #7855d623
 import re
 from fastcore.script import call_parse
 
-# %% ../nbs/00_utils.ipynb 22
+# %% ../nbs/00_utils.ipynb #1f592c78
 def download_nltk_dep():
     """
    Downloads the `nltk` dependencies
@@ -121,7 +123,7 @@ def download_nltk_dep():
     nltk.download("omw-1.4")
 
 
-# %% ../nbs/00_utils.ipynb 23
+# %% ../nbs/00_utils.ipynb #ac0ad0db
 def split_by_newline(
     text: str,  # sentences separated by \n
 ) -> L:  # list of sentences
@@ -132,7 +134,7 @@ def split_by_newline(
     return L([line for line in text.split("\n") if len(line) > 0])
 
 
-# %% ../nbs/00_utils.ipynb 25
+# %% ../nbs/00_utils.ipynb #55cc0d9e
 def rm_useless_spaces(
     t: str,  # sentence with extra spaces
 ) -> str:  # sentence without extra spaces
@@ -143,7 +145,7 @@ def rm_useless_spaces(
     return _re_space.sub(" ", t).lstrip().rstrip()
 
 
-# %% ../nbs/00_utils.ipynb 27
+# %% ../nbs/00_utils.ipynb #0fff8004
 def make_sentences(
     text: str,  # bulk text
 ) -> L:  # list of sentences
@@ -158,7 +160,7 @@ def make_sentences(
     return L(sentences)
 
 
-# %% ../nbs/00_utils.ipynb 28
+# %% ../nbs/00_utils.ipynb #71078cd4
 def write_to_file_cleaned(
     sentences: list,  # list of sentences
     fname: str,  # name of output file
@@ -172,7 +174,7 @@ def write_to_file_cleaned(
     f.close()
 
 
-# %% ../nbs/00_utils.ipynb 29
+# %% ../nbs/00_utils.ipynb #16ddbd13
 @call_parse
 def clean(
     fname: str,  # name of input txt file
@@ -185,16 +187,16 @@ def clean(
     write_to_file_cleaned(sentences, fname)
 
 
-# %% ../nbs/00_utils.ipynb 30
+# %% ../nbs/00_utils.ipynb #a9abe056
 import nltk
 from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import wordnet, stopwords
 from nltk.stem import WordNetLemmatizer
 
-# %% ../nbs/00_utils.ipynb 35
+# %% ../nbs/00_utils.ipynb #bff72363
 import unidecode
 
-# %% ../nbs/00_utils.ipynb 38
+# %% ../nbs/00_utils.ipynb #6e74f1b5
 def get_wordnet_pos(
     word: str,  # input word token
 ) -> str:  # POS of the given word
@@ -210,10 +212,10 @@ def get_wordnet_pos(
     return tag_dict.get(tag, wordnet.NOUN)
 
 
-# %% ../nbs/00_utils.ipynb 39
+# %% ../nbs/00_utils.ipynb #fcbb21cc
 from nltk.corpus import stopwords
 
-# %% ../nbs/00_utils.ipynb 40
+# %% ../nbs/00_utils.ipynb #7356fdd9
 def remove_stopwords(
     sentence: str,  # input sentence
 ) -> str:  # output sentence
@@ -228,7 +230,7 @@ def remove_stopwords(
     return " ".join(sentences)
 
 
-# %% ../nbs/00_utils.ipynb 41
+# %% ../nbs/00_utils.ipynb #de77b6cf
 def remove_punctuations(
     sentence: str,  # input sentence
 ) -> str:  # output sentence
@@ -244,7 +246,7 @@ def remove_punctuations(
     return doc
 
 
-# %% ../nbs/00_utils.ipynb 42
+# %% ../nbs/00_utils.ipynb #9fd4ef63
 def remove_punc_clean(
     sentence: str,  # input sentence
     lemmatize: bool = False,  # flag to `lemmatize`
@@ -265,7 +267,7 @@ def remove_punc_clean(
     return doc
 
 
-# %% ../nbs/00_utils.ipynb 44
+# %% ../nbs/00_utils.ipynb #6a69f4f1
 def process_for_lexical(
     fname: str,  # name of the input txt file
 ) -> L:  #
@@ -287,14 +289,14 @@ def process_for_lexical(
     return L(removed_sentences)
 
 
-# %% ../nbs/00_utils.ipynb 56
+# %% ../nbs/00_utils.ipynb #623b078e
 def num_words(
     sentence: str,  # input sentence
 ) -> int:  # number of words
     "Returns the number of words in a sentence"
     return len(remove_punctuations(sentence).split())
 
-# %% ../nbs/00_utils.ipynb 61
+# %% ../nbs/00_utils.ipynb #4e6941ec
 @patch(as_prop=True)
 def shape(self: Path):
     name = str(self)
@@ -302,14 +304,14 @@
         return np.load(self).shape
     raise AssertionError('not a npy array')
 
-# %% ../nbs/00_utils.ipynb 68
+# %% ../nbs/00_utils.ipynb #0dfdef55
 @patch(as_prop=True)
 def text(self: Path):
     if str(self).endswith('.txt'):
         with open(self) as f: return f.read()
     raise AssertionError('not a txt file')
 
-# %% ../nbs/00_utils.ipynb 71
+# %% ../nbs/00_utils.ipynb #eac82154
 @patch(as_prop=True)
 def sentences(self: Path):
     name = str(self)
diff --git a/nbs/01_pickle.ipynb b/nbs/01_pickle.ipynb
index 5cc11ed..020ae5e 100644
--- a/nbs/01_pickle.ipynb
+++ b/nbs/01_pickle.ipynb
@@ -3,6 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "001ff9c7",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,6 +12,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "665c9931",
    "metadata": {},
    "source": [
     "# Exporting to pickle\n",
@@ -21,6 +23,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "cc0ae037",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,6 +41,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "214f6fbb",
    "metadata": {},
    "source": [
     "**NOTE** The module has 2 main functions:\n",
@@ -49,6 +53,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "01c70e33",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -81,6 +86,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "091b0d73",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -98,6 +104,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "0463a325",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -108,6 +115,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "aad409d5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -122,6 +130,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "9b793316",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,6 +202,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "4a217262",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -208,6 +218,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "c35ac5e6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -220,6 +231,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "bad8ed11",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -238,6 +250,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "2804f76d",
    "metadata": {},
    "outputs": [
     {
@@ -256,6 +269,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "cb5451f8",
    "metadata": {},
    "outputs": [
     {
@@ -294,5 +308,5 @@
   }
  },
 "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 5
 }
diff --git a/nbs/_not_finished.ipynb b/nbs/_not_finished.ipynb
index ad1cb1b..3015aec 100644
--- a/nbs/_not_finished.ipynb
+++ b/nbs/_not_finished.ipynb
@@ -3,6 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "03902627",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,6 +15,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "7709e349",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -23,6 +25,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "597a9e8e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -34,6 +37,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "0f485950",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,6 +47,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "de4b4780",
    "metadata": {},
    "outputs": [
     {
@@ -60,6 +65,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "3d273bf8",
    "metadata": {},
    "outputs": [
     {
@@ -86,6 +92,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "964d3067",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,6 +108,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "7f9c1239",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -114,5 +122,5 @@
   }
  },
 "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 5
 }
diff --git a/nbs/index.ipynb b/nbs/index.ipynb
index 27f3786..48842c9 100644
--- a/nbs/index.ipynb
+++ b/nbs/index.ipynb
@@ -3,6 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "e08b8784",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,6 +15,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "2ed2b018",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,6 +27,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "4995d1a8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,6 +48,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "5322f50b",
    "metadata": {},
    "source": [
     "# Welcome to clean_plot\n",
@@ -54,6 +58,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "3d79bd10",
    "metadata": {},
    "source": [
     "![CI](https://github.com/deven367/clean_plot/actions/workflows/test.yaml/badge.svg) ![Deploy to GitHub Pages](https://github.com/deven367/clean_plot/actions/workflows/deploy.yml/badge.svg)"
@@ -61,6 +66,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "894118e2",
    "metadata": {},
    "source": [
     "## Install"
@@ -68,6 +74,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "a6c1543f",
    "metadata": {},
    "source": [
     "The easiest way to install the library is to simply do a `pip` install. "
@@ -75,6 +82,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "3f66e168",
    "metadata": {},
    "source": [
     "```python\n",
@@ -84,6 +92,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "eab6c9df",
    "metadata": {},
    "source": [
     "Another way to install the library would be to build from source. It is more likely that the released version may contain bugs. The source would get updated more often. If you plan to add features to `clean_plot` yourself, or want to be on the cutting edge, you can use an editable install."
@@ -91,6 +100,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "0c890603",
    "metadata": {},
    "source": [
     "```sh\n",
@@ -102,6 +112,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "461b34e2",
    "metadata": {},
    "source": [
     "## How to use"
@@ -109,6 +120,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "ba23426b",
    "metadata": {},
    "source": [
     "The library contains easy to use methods for cleaning text, tokenizing and lemmatizing sentences. These sentences can then be easily fed to a sentence encoder to create sentence embeddings."
@@ -117,6 +129,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "a3867d02",
    "metadata": {},
    "outputs": [
     {
@@ -154,6 +167,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "a21aac8d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -163,6 +177,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "902bed5c",
    "metadata": {},
    "outputs": [
     {
@@ -183,6 +198,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "a94d5761",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -195,6 +211,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "d81dccc6",
    "metadata": {},
    "outputs": [
     {
@@ -224,6 +241,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "5c231763",
    "metadata": {},
    "source": [
     "## Help"
@@ -231,6 +249,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "5f2f35ec",
    "metadata": {},
    "source": [
     "To see the various CLI available in the library, use the function `cp_help`"
@@ -239,6 +258,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "5e593652",
    "metadata": {},
    "outputs": [
     {
@@ -263,6 +283,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "947bb862",
    "metadata": {},
    "source": [
     "## Contributing"
@@ -270,6 +291,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "ad0705db",
    "metadata": {},
    "source": [
     "This library has come into existence because of [nbdev](https://nbdev.fast.ai/) (one of many amazing tools made by [fast.ai](https://www.fast.ai/)). PRs and Issues are encouraged. \n",
@@ -292,5 +314,5 @@
   }
  },
 "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 5
 }
diff --git a/pyproject.toml b/pyproject.toml
index aef3009..aa6df4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,53 @@
+[build-system]
+requires = ["setuptools>=64"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "clean_plot"
+dynamic = ["version"]
+description = "clean_plot simplifies cleaning text files for creation of embeddings and making plots from it"
+readme = "README.md"
+requires-python = ">=3.7"
+license = {text = "Apache-2.0"}
+authors = [{name = "Deven Mistry", email = "masterdeven@gmail.com"}]
+keywords = ['cleaning', 'tokenizing', 'embeddings', 'plotting']
+classifiers = ["Natural Language :: English", "Intended Audience :: Developers", "Development Status :: 2 - Pre-Alpha", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only"]
+dependencies = ['numpy', 'pandas', 'seaborn', 'scikit-learn', 'nltk', 'unidecode', 'fastcore']
+
+[project.urls]
+Repository = "https://github.com/deven367/clean_plot/"
+Documentation = "https://deven367.github.io/clean_plot/"
+
+[project.entry-points.nbdev]
+clean_plot = "clean_plot._modidx:d"
+
+[project.optional-dependencies]
+dev = ['nbdev', 'notebook', 'black']
+
+
+[project.scripts]
+heatmaps = "clean_plot.heatmaps_novels:plot_novels"
+corr_hm = "clean_plot.heatmaps_novels:corr_heatmaps"
+lex_ts = "clean_plot.heatmaps_novels:lex_ts"
+heatmaps_pkl = "clean_plot.heatmaps_novels:ssms_from_pkl"
+clean_file = "clean_plot.utils:clean"
+histograms = "clean_plot.heatmaps_novels:plot_histograms"
+make_pkl = "clean_plot.pickle:create_dict_whole_book"
+ts_pkl = "clean_plot.heatmaps_novels:heatmap_from_pkl"
+cp_help = "clean_plot.utils:chelp"
+
+[tool.setuptools.dynamic]
+version = {attr = "clean_plot.__version__"}
+
+[tool.setuptools.packages.find]
+include = ["clean_plot"]
+
+[tool.nbdev]
+branch = 'master'
+tst_flags = 'slow colab local'
+jupyter_hooks = true
+custom_sidebar = true
+
 [tool.black]
 line-length = 80
 target-version = ['py36', 'py37', 'py38', 'py39']
diff --git a/settings.ini b/settings.ini
deleted file mode 100644
index 1a762ed..0000000
--- a/settings.ini
+++ /dev/null
@@ -1,46 +0,0 @@
-[DEFAULT]
-host = github
-lib_name = clean_plot
-user = deven367
-description = clean_plot simplifies cleaning text files for creation of embeddings and making plots from it
-keywords = cleaning tokenizing embeddings plotting
-author = Deven Mistry
-author_email = masterdeven@gmail.com
-copyright = Deven Mistry
-branch = master
-version = 0.0.14
-min_python = 3.7
-audience = Developers
-language = English
-custom_sidebar = True
-license = apache2
-status = 2
-requirements = numpy pandas seaborn scikit-learn nltk unidecode fastcore
-dev_requirements = nbdev notebook black
-nbs_path = nbs
-doc_path = _docs
-recursive = True
-repo = clean_plot
-doc_host = https://%(user)s.github.io
-doc_baseurl = /%(lib_name)s/
-git_url = https://github.com/%(user)s/%(repo)s/
-lib_path = %(lib_name)s
-title = %(lib_name)s
-monospace_docstrings = True
-tst_flags = slow colab local
-console_scripts = heatmaps=clean_plot.heatmaps_novels:plot_novels
-    corr_hm=clean_plot.heatmaps_novels:corr_heatmaps
-    lex_ts=clean_plot.heatmaps_novels:lex_ts
-    heatmaps_pkl=clean_plot.heatmaps_novels:ssms_from_pkl
-    clean_file=clean_plot.utils:clean
-    histograms=clean_plot.heatmaps_novels:plot_histograms
-    make_pkl=clean_plot.pickle:create_dict_whole_book
-    ts_pkl=clean_plot.heatmaps_novels:heatmap_from_pkl
-    cp_help=clean_plot.utils:chelp
-black_formatting = False
-readme_nb = index.ipynb
-allowed_metadata_keys = 
-allowed_cell_metadata_keys = 
-jupyter_hooks = True
-clean_ids = True
-
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 5cf5922..0000000
--- a/setup.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from pkg_resources import parse_version
-from configparser import ConfigParser
-import setuptools
-assert parse_version(setuptools.__version__)>=parse_version('36.2')
-
-# note: all settings are in settings.ini; edit there, not here
-config = ConfigParser(delimiters=['='])
-config.read('settings.ini')
-cfg = config['DEFAULT']
-
-cfg_keys = 'version description keywords author author_email'.split()
-expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
-for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
-setup_cfg = {o:cfg[o] for o in cfg_keys}
-
-licenses = {
-    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
-    'mit': ('MIT License', 'OSI Approved :: MIT License'),
-    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
-    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
-    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
-}
-statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
-    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
-py_versions = '3.6 3.7 3.8 3.9 3.10'.split()
-
-requirements = cfg.get('requirements','').split()
-if cfg.get('pip_requirements'): requirements += cfg.get('pip_requirements','').split()
-min_python = cfg['min_python']
-lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
-dev_requirements = (cfg.get('dev_requirements') or '').split()
-
-setuptools.setup(
-    name = cfg['lib_name'],
-    license = lic[0],
-    classifiers = [
-        'Development Status :: ' + statuses[int(cfg['status'])],
-        'Intended Audience :: ' + cfg['audience'].title(),
-        'Natural Language :: ' + cfg['language'].title(),
-    ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
-    url = cfg['git_url'],
-    packages = setuptools.find_packages(),
-    include_package_data = True,
-    install_requires = requirements,
-    extras_require={ 'dev': dev_requirements },
-    dependency_links = cfg.get('dep_links','').split(),
-    python_requires  = '>=' + cfg['min_python'],
-    long_description = open('README.md').read(),
-    long_description_content_type = 'text/markdown',
-    zip_safe = False,
-    entry_points = {
-        'console_scripts': cfg.get('console_scripts','').split(),
-        'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d']
-    },
-    **setup_cfg)
-
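With settings.ini and setup.py deleted, pyproject.toml is now the single source of packaging truth. A quick post-install sanity check that the version and console scripts survived the migration — a sketch that assumes the package is installed (e.g. `pip install -e .`) and Python 3.10+ for the selectable `entry_points` API:

```python
from importlib.metadata import version, entry_points

# [tool.setuptools.dynamic] pulls the version from clean_plot.__version__.
assert version("clean_plot") == "0.0.14"

# Expect the nine commands declared under [project.scripts].
scripts = sorted(ep.name for ep in entry_points(group="console_scripts")
                 if ep.value.startswith("clean_plot."))
print(scripts)
```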