From 60c1feb7a5d2108fce93a0e3a4025c0f8caef562 Mon Sep 17 00:00:00 2001 From: luigi Date: Fri, 13 Feb 2026 20:57:10 +0800 Subject: [PATCH] fix(memory): eliminate memory leaks in Python bindings and inference pipeline - Add explicit destructors to BaseTokenizer, BaseModelForConditionalGeneration, and PreludeCacheDisable to properly delete heap-allocated members - Fix CoreAttention pos_helper to use heap allocation instead of stack reference - Add virtual destructors to DataReader and Processor base classes - Expose chatllm_destroy() in Python bindings and clean up callback dict references Fixes memory accumulation during repeated inference iterations. --- bindings/chatllm.py | 11 +++++++++++ src/chat.cpp | 9 +++++++++ src/chat.h | 2 +- src/layers.h | 10 +++++++++- src/models.cpp | 9 +++++++++ src/models_priv.h | 2 +- src/tokenizer.h | 4 ++++ 7 files changed, 44 insertions(+), 3 deletions(-) diff --git a/bindings/chatllm.py b/bindings/chatllm.py index 506d2109..6aa1c34f 100644 --- a/bindings/chatllm.py +++ b/bindings/chatllm.py @@ -562,6 +562,17 @@ def save_session(self, file_name: str) -> str: def load_session(self, file_name: str) -> str: return self._lib.load_session(self._chat, file_name) + def destroy(self) -> int: + if hasattr(self, "_chat") and self._chat: + if self.is_generating: self.abort() + obj_id = LibChatLLM._obj2id.get(self) + if obj_id is not None: + LibChatLLM._obj2id.pop(self, None) + LibChatLLM._id2obj.pop(obj_id, None) + self._lib.destroy(self._chat) + self._chat = None + return 0 + def callback_print_reference(self, s: str) -> None: self.references.append(s) diff --git a/src/chat.cpp b/src/chat.cpp index fd562b50..79d6cd6c 100644 --- a/src/chat.cpp +++ b/src/chat.cpp @@ -627,6 +627,15 @@ namespace chatllm qa_encoder->set_tokenizer(this); } + BaseTokenizer::~BaseTokenizer() + { + if (tp) + { + delete tp; + tp = nullptr; + } + } + void BaseTokenizer::set_chat_encoder(BaseHistoryEncoder *encoder) { chat_encoder = encoder; diff --git a/src/chat.h 
b/src/chat.h index c293b29f..0936e8ba 100644 --- a/src/chat.h +++ b/src/chat.h @@ -287,7 +287,7 @@ namespace chatllm BaseHistoryEncoder *qa_encoder = nullptr, BaseHistoryEncoder *completion_encoder = nullptr); - virtual ~BaseTokenizer() = default; + virtual ~BaseTokenizer(); virtual size_t load(tokenizer::DataReader *buffer, int n_vocab) = 0; diff --git a/src/layers.h b/src/layers.h index dedc91c4..275696aa 100644 --- a/src/layers.h +++ b/src/layers.h @@ -311,6 +311,14 @@ namespace chatllm PreludeCacheDisable(void): disabler(new BlockParams::DisableCache()) { } + virtual ~PreludeCacheDisable() + { + if (disabler) + { + delete disabler; + disabler = nullptr; + } + } protected: BlockParams::DisableCache *disabler; }; @@ -1421,7 +1429,7 @@ namespace chatllm sinks(BlockParams::CoreAttentionUseSinks::get() > 0 ? ggml::new_tensor_1d(ctx, ggml::type::GGML_TYPE_F32, BlockParams::CoreAttentionUseSinks::get()) : nullptr), - pos_helper(helper ? helper : &def_pos_helper) + pos_helper(helper ? helper : new BaseTensorPosHelper(max_length)) { allocate_pos_tensor(ctx); } diff --git a/src/models.cpp b/src/models.cpp index 7724fa5f..a440c1aa 100644 --- a/src/models.cpp +++ b/src/models.cpp @@ -925,6 +925,15 @@ namespace chatllm layer_ids.push_back(i); } + BaseModelForConditionalGeneration::~BaseModelForConditionalGeneration() + { + if (transformer) + { + delete transformer; + transformer = nullptr; + } + } + void BaseModelForConditionalGeneration::set_layer_ids(const std::vector<int> &ids) { CHATLLM_CHECK((int)ids.size() == config_.num_hidden_layers) << "length(layer_ids) must be " << config_.num_hidden_layers; diff --git a/src/models_priv.h b/src/models_priv.h index 7177a079..419382ba 100644 --- a/src/models_priv.h +++ b/src/models_priv.h @@ -395,7 +395,7 @@ namespace chatllm { public: BaseModelForConditionalGeneration(ModelType model_type, BaseConfig config, const RuntimeConfig &runtime_config, size_t GRAPH_SIZE = 4096); - virtual ~BaseModelForConditionalGeneration() = default; + 
virtual ~BaseModelForConditionalGeneration(); void set_layer_ids(const std::vector<int> &ids) override; int get_max_length(void) override; diff --git a/src/tokenizer.h b/src/tokenizer.h index f78eb0ca..a6d9b517 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -106,6 +106,8 @@ class TextPrepAddLeadingSpace : public TextPreprocessor class DataReader { public: + virtual ~DataReader() {} + virtual int64_t tell() = 0; virtual void seek(int64_t offset, int whence) = 0; virtual int64_t size(void) const { return _size; } @@ -136,6 +138,8 @@ class Processor vocab_.byte_fallback_ready = false; } + virtual ~Processor() {} + virtual size_t Load(DataReader *data_reader, int n_vocab) = 0; virtual int PieceToId(std::string_view piece) const;