From 3e1f923e1f104612760d7656cf1408b1961fdd4c Mon Sep 17 00:00:00 2001
From: simba <simba@simbas-MacBook-Pro.local>
Date: Sun, 22 Mar 2026 10:47:32 -0700
Subject: [PATCH 1/3] feat: update mlx-swift-lm to SharpAI fork main branch for
 Qwen3.5 support

---
 Package.resolved | 28 +++++++++++++++++++++++-----
 Package.swift    | 11 ++++++-----
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/Package.resolved b/Package.resolved
index ab776ab..b13baf8 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -9,6 +9,15 @@
         "version" : "1.32.1"
       }
     },
+    {
+      "identity" : "eventsource",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/mattt/EventSource.git",
+      "state" : {
+        "revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e",
+        "version" : "1.4.1"
+      }
+    },
     {
       "identity" : "hummingbird",
       "kind" : "remoteSourceControl",
@@ -30,10 +39,10 @@
     {
       "identity" : "mlx-swift-lm",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/ml-explore/mlx-swift-lm",
+      "location" : "https://github.com/SharpAI/mlx-swift-lm",
       "state" : {
-        "revision" : "7e19e09027923d89ac47dd087d9627f610e5a91a",
-        "version" : "2.30.6"
+        "branch" : "main",
+        "revision" : "edd42fcd947eea0b19665248acf2975a28ddf58b"
       }
     },
     {
@@ -144,6 +153,15 @@
         "version" : "1.5.1"
       }
     },
+    {
+      "identity" : "swift-huggingface",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-huggingface.git",
+      "state" : {
+        "revision" : "b721959445b617d0bf03910b2b4aced345fd93bf",
+        "version" : "0.9.0"
+      }
+    },
     {
       "identity" : "swift-jinja",
       "kind" : "remoteSourceControl",
@@ -257,8 +275,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/huggingface/swift-transformers",
       "state" : {
-        "revision" : "150169bfba0889c229a2ce7494cf8949f18e6906",
-        "version" : "1.1.9"
+        "revision" : "eed7264ac5e4ec5dfa6165c6e5c5577364344fe4",
+        "version" : "1.2.0"
       }
     },
     {
diff --git a/Package.swift b/Package.swift
index 08e933d..1a95455 100644
--- a/Package.swift
+++ b/Package.swift
@@ -5,12 +5,13 @@ let package = Package(
     name: "mlx-server",
     platforms: [.macOS(.v14)],
     dependencies: [
-        // Apple MLX Swift — core inference engine
-        .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.3")),
-        // Apple's LLM library built on MLX Swift (Qwen, Llama, Mistral, Gemma etc.)
-        .package(url: "https://github.com/ml-explore/mlx-swift-lm", from: "2.0.0"),
+        // Apple MLX Swift — core inference engine (Apple-maintained, tagged releases)
+        .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.6")),
+        // Apple's LLM library built on MLX Swift (SharpAI fork)
+        // Tracks the main branch for Qwen3.5 support (PRs #97, #120, #129, #133, #135 — not yet in a release tag); the exact commit is fixed only by Package.resolved
+        .package(url: "https://github.com/SharpAI/mlx-swift-lm", branch: "main"),
         // HuggingFace tokenizers + model download
-        .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.1.0")),
+        .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.2.0")),
         // Lightweight HTTP server (Apple-backed Swift server project)
         .package(url: "https://github.com/hummingbird-project/hummingbird", from: "2.0.0"),
         // Async argument parser (for CLI flags: --model, --port)

From 91ee743da6c2aeb48f031f2f72b8e6ec49c2e3ee Mon Sep 17 00:00:00 2001
From: simba <simba@simbas-MacBook-Pro.local>
Date: Sun, 22 Mar 2026 10:58:54 -0700
Subject: [PATCH 2/3] feat: add --thinking flag; thinking mode is disabled by
 default (Qwen3.5)

---
 Sources/mlx-server/main.swift | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Sources/mlx-server/main.swift b/Sources/mlx-server/main.swift
index 6d8db12..2e1f4f5 100644
--- a/Sources/mlx-server/main.swift
+++ b/Sources/mlx-server/main.swift
@@ -51,6 +51,9 @@ struct MLXServer: AsyncParsableCommand {
     @Option(name: .long, help: "Number of parallel request slots")
     var parallel: Int = 1
 
+    @Flag(name: .long, help: "Enable thinking/reasoning mode (Qwen3.5 etc). Default: disabled")
+    var thinking: Bool = false
+
     mutating func run() async throws {
         print("[mlx-server] Loading model: \(model)")
         let modelId = model
@@ -72,6 +75,7 @@ struct MLXServer: AsyncParsableCommand {
         let defaultTemp = self.temp
         let defaultTopP = self.topP
         let defaultRepeatPenalty = self.repeatPenalty
+        let thinkingEnabled = self.thinking
         let parallelSlots = self.parallel
 
         // ── Concurrency limiter ──
@@ -141,7 +145,10 @@ struct MLXServer: AsyncParsableCommand {
             // ── Acquire slot (concurrency limiter) ──
             await semaphore.wait()
 
-            let userInput = UserInput(chat: chatMessages)
+            // Pass enable_thinking to the Jinja chat template via additionalContext
+            // (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}')
+            let templateContext: [String: any Sendable]? = thinkingEnabled ? nil : ["enable_thinking": false]
+            let userInput = UserInput(chat: chatMessages, additionalContext: templateContext)
             let lmInput = try await container.prepare(input: userInput)
             let stream = try await container.generate(input: lmInput, parameters: params)
 

From 19717cfd32754b22a49a6784a87eae28c3f91bf5 Mon Sep 17 00:00:00 2001
From: simba <simba@simbas-MacBook-Pro.local>
Date: Sun, 22 Mar 2026 11:19:14 -0700
Subject: [PATCH 3/3] feat: add full OpenAI-compatible tool calling support

---
 Sources/mlx-server/main.swift | 136 +++++++++++++++++++++++++++++++---
 1 file changed, 126 insertions(+), 10 deletions(-)

diff --git a/Sources/mlx-server/main.swift b/Sources/mlx-server/main.swift
index 2e1f4f5..c56b01a 100644
--- a/Sources/mlx-server/main.swift
+++ b/Sources/mlx-server/main.swift
@@ -142,13 +142,25 @@ struct MLXServer: AsyncParsableCommand {
                 }
             }
 
+            // Convert OpenAI tools format → [String: any Sendable] for UserInput
+            let toolSpecs: [[String: any Sendable]]? = chatReq.tools?.map { tool in
+                var spec: [String: any Sendable] = ["type": tool.type]
+                var fn: [String: any Sendable] = ["name": tool.function.name]
+                if let desc = tool.function.description { fn["description"] = desc }
+                if let params = tool.function.parameters {
+                    fn["parameters"] = params.mapValues { $0.value }
+                }
+                spec["function"] = fn
+                return spec
+            }
+
             // ── Acquire slot (concurrency limiter) ──
             await semaphore.wait()
 
             // Pass enable_thinking to the Jinja chat template via additionalContext
             // (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}')
             let templateContext: [String: any Sendable]? = thinkingEnabled ? nil : ["enable_thinking": false]
-            let userInput = UserInput(chat: chatMessages, additionalContext: templateContext)
+            let userInput = UserInput(chat: chatMessages, tools: toolSpecs, additionalContext: templateContext)
             let lmInput = try await container.prepare(input: userInput)
             let stream = try await container.generate(input: lmInput, parameters: params)
 
@@ -156,16 +168,22 @@ struct MLXServer: AsyncParsableCommand {
                 // SSE streaming
                 let (sseStream, cont) = AsyncStream<String>.makeStream()
                 Task {
+                    var hasToolCalls = false
+                    var toolCallIndex = 0
                     for await generation in stream {
                         switch generation {
                         case .chunk(let text):
                             cont.yield(sseChunk(modelId: modelId, delta: text, finishReason: nil))
+                        case .toolCall(let tc):
+                            hasToolCalls = true
+                            let argsJson = serializeToolCallArgs(tc.function.arguments)
+                            cont.yield(sseToolCallChunk(modelId: modelId, index: toolCallIndex, name: tc.function.name, arguments: argsJson))
+                            toolCallIndex += 1
                         case .info:
-                            cont.yield(sseChunk(modelId: modelId, delta: "", finishReason: "stop"))
+                            let reason = hasToolCalls ? "tool_calls" : "stop"
+                            cont.yield(sseChunk(modelId: modelId, delta: "", finishReason: reason))
                             cont.yield("data: [DONE]\n\n")
                             cont.finish()
-                        case .toolCall:
-                            break
                         }
                     }
                     cont.finish()
@@ -177,15 +195,25 @@ struct MLXServer: AsyncParsableCommand {
                     body: .init(asyncSequence: sseStream.map { ByteBuffer(string: $0) })
                 )
             } else {
-                // Non-streaming: collect all chunks
+                // Non-streaming: collect all chunks and tool calls
                 var fullText = ""
                 var completionTokenCount = 0
+                var collectedToolCalls: [ToolCallResponse] = []
+                var tcIndex = 0
                 for await generation in stream {
                     switch generation {
                     case .chunk(let text):
                         fullText += text
                         completionTokenCount += 1
-                    case .info, .toolCall:
+                    case .toolCall(let tc):
+                        let argsJson = serializeToolCallArgs(tc.function.arguments)
+                        collectedToolCalls.append(ToolCallResponse(
+                            id: "call_\(UUID().uuidString.prefix(8))",
+                            type: "function",
+                            function: ToolCallFunction(name: tc.function.name, arguments: argsJson)
+                        ))
+                        tcIndex += 1
+                    case .info:
                         break
                     }
                 }
@@ -196,6 +224,7 @@ struct MLXServer: AsyncParsableCommand {
                 let estimatedPromptTokens = max(1, promptText.count / 4)
                 let totalTokens = estimatedPromptTokens + completionTokenCount
 
+                let hasToolCalls = !collectedToolCalls.isEmpty
                 let resp = ChatCompletionResponse(
                     id: "chatcmpl-\(UUID().uuidString)",
                     model: modelId,
@@ -203,8 +232,12 @@ struct MLXServer: AsyncParsableCommand {
                     choices: [
                         Choice(
                             index: 0,
-                            message: AssistantMessage(role: "assistant", content: fullText),
-                            finishReason: "stop"
+                            message: AssistantMessage(
+                                role: "assistant",
+                                content: fullText.isEmpty && hasToolCalls ? nil : fullText,
+                                toolCalls: hasToolCalls ? collectedToolCalls : nil
+                            ),
+                            finishReason: hasToolCalls ? "tool_calls" : "stop"
                         )
                     ],
                     usage: TokenUsage(promptTokens: estimatedPromptTokens, completionTokens: completionTokenCount, totalTokens: totalTokens)
@@ -319,6 +352,41 @@ func sseChunk(modelId: String, delta: String, finishReason: String?) -> String {
     return "data: \(String(data: data, encoding: .utf8)!)\n\n"
 }
 
+func sseToolCallChunk(modelId: String, index: Int, name: String, arguments: String) -> String {
+    let chunk: [String: Any] = [
+        "id": "chatcmpl-\(UUID().uuidString)",
+        "object": "chat.completion.chunk",
+        "created": Int(Date().timeIntervalSince1970),
+        "model": modelId,
+        "choices": [[
+            "index": 0,
+            "delta": [
+                "role": "assistant",
+                "tool_calls": [[
+                    "index": index,
+                    "id": "call_\(UUID().uuidString.prefix(8))",
+                    "type": "function",
+                    "function": [
+                        "name": name,
+                        "arguments": arguments,
+                    ] as [String: Any],
+                ] as [String: Any]],
+            ] as [String: Any],
+        ] as [String: Any]]
+    ]
+    let data = try! JSONSerialization.data(withJSONObject: chunk)
+    return "data: \(String(data: data, encoding: .utf8)!)\n\n"
+}
+
+/// Serialize ToolCall arguments ([String: JSONValue]) to a JSON string; returns "{}" if serialization or UTF-8 conversion fails
+func serializeToolCallArgs(_ args: [String: JSONValue]) -> String {
+    let anyDict = args.mapValues { $0.anyValue }
+    guard let data = try? JSONSerialization.data(withJSONObject: anyDict) else {
+        return "{}"
+    }
+    return String(data: data, encoding: .utf8) ?? "{}"
+}
+
 // ── OpenAI-compatible types ───────────────────────────────────────────────────
 
 struct ChatCompletionRequest: Decodable {
@@ -326,6 +394,15 @@ struct ChatCompletionRequest: Decodable {
         let role: String
         let content: String
     }
+    struct ToolDef: Decodable {
+        let type: String
+        let function: ToolFuncDef
+    }
+    struct ToolFuncDef: Decodable {
+        let name: String
+        let description: String?
+        let parameters: [String: AnyCodable]?
+    }
     let model: String?
     let messages: [Message]
     let stream: Bool?
@@ -333,9 +410,10 @@ struct ChatCompletionRequest: Decodable {
     let temperature: Double?
     let topP: Double?
     let repetitionPenalty: Double?
+    let tools: [ToolDef]?
 
     enum CodingKeys: String, CodingKey {
-        case model, messages, stream, temperature
+        case model, messages, stream, temperature, tools
         case maxTokens = "max_tokens"
         case topP = "top_p"
         case repetitionPenalty = "repetition_penalty"
@@ -364,7 +442,45 @@ struct Choice: Encodable {
 
 struct AssistantMessage: Encodable {
     let role: String
-    let content: String
+    let content: String?
+    let toolCalls: [ToolCallResponse]?
+
+    enum CodingKeys: String, CodingKey {
+        case role, content
+        case toolCalls = "tool_calls"
+    }
+}
+
+struct ToolCallResponse: Encodable {
+    let id: String
+    let type: String
+    let function: ToolCallFunction
+}
+
+struct ToolCallFunction: Encodable {
+    let name: String
+    let arguments: String
+}
+
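+/// AnyCodable: decodes an arbitrary JSON value (null, Bool, Int, Double, String, array, object — tried in that order) for tool-parameter pass-through; anything else becomes NSNull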
+struct AnyCodable: Decodable, Sendable {
+    let value: Any
+    init(from decoder: Decoder) throws {
+        let c = try decoder.singleValueContainer()
+        if c.decodeNil() { value = NSNull() }
+        else if let b = try? c.decode(Bool.self) { value = b }
+        else if let i = try? c.decode(Int.self) { value = i }
+        else if let d = try? c.decode(Double.self) { value = d }
+        else if let s = try? c.decode(String.self) { value = s }
+        else if let a = try? c.decode([AnyCodable].self) { value = a.map { $0.value } }
+        else if let d = try? c.decode([String: AnyCodable].self) { value = d.mapValues { $0.value } }
+        else { value = NSNull() }
+    }
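+    // Convert back to [String: any Sendable] for ToolSpec usage. NOTE(review): force-casts every value with as! and is not called anywhere in this patch — confirm it is still needed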
+    static func toSendable(_ dict: [String: AnyCodable]?) -> [String: any Sendable]? {
+        guard let dict else { return nil }
+        return dict.mapValues { $0.value as! any Sendable }
+    }
 }
 
 struct TokenUsage: Encodable {