From 3e1f923e1f104612760d7656cf1408b1961fdd4c Mon Sep 17 00:00:00 2001 From: simba Date: Sun, 22 Mar 2026 10:47:32 -0700 Subject: [PATCH 1/3] feat: update mlx-swift-lm to SharpAI fork main branch for Qwen3.5 support --- Package.resolved | 28 +++++++++++++++++++++++----- Package.swift | 11 ++++++----- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/Package.resolved b/Package.resolved index ab776ab..b13baf8 100644 --- a/Package.resolved +++ b/Package.resolved @@ -9,6 +9,15 @@ "version" : "1.32.1" } }, + { + "identity" : "eventsource", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mattt/EventSource.git", + "state" : { + "revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e", + "version" : "1.4.1" + } + }, { "identity" : "hummingbird", "kind" : "remoteSourceControl", @@ -30,10 +39,10 @@ { "identity" : "mlx-swift-lm", "kind" : "remoteSourceControl", - "location" : "https://github.com/ml-explore/mlx-swift-lm", + "location" : "https://github.com/SharpAI/mlx-swift-lm", "state" : { - "revision" : "7e19e09027923d89ac47dd087d9627f610e5a91a", - "version" : "2.30.6" + "branch" : "main", + "revision" : "edd42fcd947eea0b19665248acf2975a28ddf58b" } }, { @@ -144,6 +153,15 @@ "version" : "1.5.1" } }, + { + "identity" : "swift-huggingface", + "kind" : "remoteSourceControl", + "location" : "https://github.com/huggingface/swift-huggingface.git", + "state" : { + "revision" : "b721959445b617d0bf03910b2b4aced345fd93bf", + "version" : "0.9.0" + } + }, { "identity" : "swift-jinja", "kind" : "remoteSourceControl", @@ -257,8 +275,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/huggingface/swift-transformers", "state" : { - "revision" : "150169bfba0889c229a2ce7494cf8949f18e6906", - "version" : "1.1.9" + "revision" : "eed7264ac5e4ec5dfa6165c6e5c5577364344fe4", + "version" : "1.2.0" } }, { diff --git a/Package.swift b/Package.swift index 08e933d..1a95455 100644 --- a/Package.swift +++ b/Package.swift @@ -5,12 +5,13 @@ let package = Package( name: "mlx-server", platforms: [.macOS(.v14)], dependencies: [ - // Apple MLX Swift — core inference engine - .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.3")), - // Apple's LLM library built on MLX Swift (Qwen, Llama, Mistral, Gemma etc.) - .package(url: "https://github.com/ml-explore/mlx-swift-lm", from: "2.0.0"), + // Apple MLX Swift — core inference engine (Apple-maintained, tagged releases) + .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.30.6")), + // Apple's LLM library built on MLX Swift (SharpAI fork) + // Pinned to main branch for Qwen3.5 support (PRs #97, #120, #129, #133, #135 — not yet in a release tag) + .package(url: "https://github.com/SharpAI/mlx-swift-lm", branch: "main"), // HuggingFace tokenizers + model download - .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.1.0")), + .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "1.2.0")), // Lightweight HTTP server (Apple-backed Swift server project) .package(url: "https://github.com/hummingbird-project/hummingbird", from: "2.0.0"), // Async argument parser (for CLI flags: --model, --port) From 91ee743da6c2aeb48f031f2f72b8e6ec49c2e3ee Mon Sep 17 00:00:00 2001 From: simba Date: Sun, 22 Mar 2026 10:58:54 -0700 Subject: [PATCH 2/3] feat: add --thinking flag to disable thinking mode by default (Qwen3.5) --- Sources/mlx-server/main.swift | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Sources/mlx-server/main.swift b/Sources/mlx-server/main.swift index 6d8db12..2e1f4f5 100644 --- a/Sources/mlx-server/main.swift +++ b/Sources/mlx-server/main.swift @@ -51,6 +51,9 @@ struct MLXServer: AsyncParsableCommand { @Option(name: .long, help: "Number of parallel request slots") var parallel: Int = 1 + @Flag(name: .long, help: "Enable thinking/reasoning mode (Qwen3.5 etc). Default: disabled") + var thinking: Bool = false + mutating func run() async throws { print("[mlx-server] Loading model: \(model)") let modelId = model @@ -72,6 +75,7 @@ struct MLXServer: AsyncParsableCommand { let defaultTemp = self.temp let defaultTopP = self.topP let defaultRepeatPenalty = self.repeatPenalty + let thinkingEnabled = self.thinking let parallelSlots = self.parallel // ── Concurrency limiter ── @@ -141,7 +145,10 @@ struct MLXServer: AsyncParsableCommand { // ── Acquire slot (concurrency limiter) ── await semaphore.wait() - let userInput = UserInput(chat: chatMessages) + // Pass enable_thinking to the Jinja chat template via additionalContext + // (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}') + let templateContext: [String: any Sendable]? = thinkingEnabled ? nil : ["enable_thinking": false] + let userInput = UserInput(chat: chatMessages, additionalContext: templateContext) let lmInput = try await container.prepare(input: userInput) let stream = try await container.generate(input: lmInput, parameters: params) From 19717cfd32754b22a49a6784a87eae28c3f91bf5 Mon Sep 17 00:00:00 2001 From: simba Date: Sun, 22 Mar 2026 11:19:14 -0700 Subject: [PATCH 3/3] feat: add full OpenAI-compatible tool calling support --- Sources/mlx-server/main.swift | 136 +++++++++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 10 deletions(-) diff --git a/Sources/mlx-server/main.swift b/Sources/mlx-server/main.swift index 2e1f4f5..c56b01a 100644 --- a/Sources/mlx-server/main.swift +++ b/Sources/mlx-server/main.swift @@ -142,13 +142,25 @@ struct MLXServer: AsyncParsableCommand { } } + // Convert OpenAI tools format → [String: any Sendable] for UserInput + let toolSpecs: [[String: any Sendable]]? = chatReq.tools?.map { tool in + var spec: [String: any Sendable] = ["type": tool.type] + var fn: [String: any Sendable] = ["name": tool.function.name] + if let desc = tool.function.description { fn["description"] = desc } + if let params = tool.function.parameters { + fn["parameters"] = params.mapValues { $0.value } + } + spec["function"] = fn + return spec + } + // ── Acquire slot (concurrency limiter) ── await semaphore.wait() // Pass enable_thinking to the Jinja chat template via additionalContext // (mirrors llama-server's --chat-template-kwargs '{"enable_thinking":false}') let templateContext: [String: any Sendable]? = thinkingEnabled ? nil : ["enable_thinking": false] - let userInput = UserInput(chat: chatMessages, additionalContext: templateContext) + let userInput = UserInput(chat: chatMessages, tools: toolSpecs, additionalContext: templateContext) let lmInput = try await container.prepare(input: userInput) let stream = try await container.generate(input: lmInput, parameters: params) @@ -156,16 +168,22 @@ struct MLXServer: AsyncParsableCommand { // SSE streaming let (sseStream, cont) = AsyncStream.makeStream() Task { + var hasToolCalls = false + var toolCallIndex = 0 for await generation in stream { switch generation { case .chunk(let text): cont.yield(sseChunk(modelId: modelId, delta: text, finishReason: nil)) + case .toolCall(let tc): + hasToolCalls = true + let argsJson = serializeToolCallArgs(tc.function.arguments) + cont.yield(sseToolCallChunk(modelId: modelId, index: toolCallIndex, name: tc.function.name, arguments: argsJson)) + toolCallIndex += 1 case .info: - cont.yield(sseChunk(modelId: modelId, delta: "", finishReason: "stop")) + let reason = hasToolCalls ? "tool_calls" : "stop" + cont.yield(sseChunk(modelId: modelId, delta: "", finishReason: reason)) cont.yield("data: [DONE]\n\n") cont.finish() - case .toolCall: - break } } cont.finish() @@ -177,15 +195,25 @@ struct MLXServer: AsyncParsableCommand { body: .init(asyncSequence: sseStream.map { ByteBuffer(string: $0) }) ) } else { - // Non-streaming: collect all chunks + // Non-streaming: collect all chunks and tool calls var fullText = "" var completionTokenCount = 0 + var collectedToolCalls: [ToolCallResponse] = [] + var tcIndex = 0 for await generation in stream { switch generation { case .chunk(let text): fullText += text completionTokenCount += 1 - case .info, .toolCall: + case .toolCall(let tc): + let argsJson = serializeToolCallArgs(tc.function.arguments) + collectedToolCalls.append(ToolCallResponse( + id: "call_\(UUID().uuidString.prefix(8))", + type: "function", + function: ToolCallFunction(name: tc.function.name, arguments: argsJson) + )) + tcIndex += 1 + case .info: break } } @@ -196,6 +224,7 @@ struct MLXServer: AsyncParsableCommand { let estimatedPromptTokens = max(1, promptText.count / 4) let totalTokens = estimatedPromptTokens + completionTokenCount + let hasToolCalls = !collectedToolCalls.isEmpty let resp = ChatCompletionResponse( id: "chatcmpl-\(UUID().uuidString)", model: modelId, @@ -203,8 +232,12 @@ struct MLXServer: AsyncParsableCommand { choices: [ Choice( index: 0, - message: AssistantMessage(role: "assistant", content: fullText), - finishReason: "stop" + message: AssistantMessage( + role: "assistant", + content: fullText.isEmpty && hasToolCalls ? nil : fullText, + toolCalls: hasToolCalls ? collectedToolCalls : nil + ), + finishReason: hasToolCalls ? "tool_calls" : "stop" ) ], usage: TokenUsage(promptTokens: estimatedPromptTokens, completionTokens: completionTokenCount, totalTokens: totalTokens) @@ -319,6 +352,41 @@ func sseChunk(modelId: String, delta: String, finishReason: String?) -> String { return "data: \(String(data: data, encoding: .utf8)!)\n\n" } +func sseToolCallChunk(modelId: String, index: Int, name: String, arguments: String) -> String { + let chunk: [String: Any] = [ + "id": "chatcmpl-\(UUID().uuidString)", + "object": "chat.completion.chunk", + "created": Int(Date().timeIntervalSince1970), + "model": modelId, + "choices": [[ + "index": 0, + "delta": [ + "role": "assistant", + "tool_calls": [[ + "index": index, + "id": "call_\(UUID().uuidString.prefix(8))", + "type": "function", + "function": [ + "name": name, + "arguments": arguments, + ] as [String: Any], + ] as [String: Any]], + ] as [String: Any], + ] as [String: Any]] + ] + let data = try! JSONSerialization.data(withJSONObject: chunk) + return "data: \(String(data: data, encoding: .utf8)!)\n\n" +} + +/// Serialize ToolCall arguments ([String: JSONValue]) to a JSON string +func serializeToolCallArgs(_ args: [String: JSONValue]) -> String { + let anyDict = args.mapValues { $0.anyValue } + guard let data = try? JSONSerialization.data(withJSONObject: anyDict) else { + return "{}" + } + return String(data: data, encoding: .utf8) ?? "{}" +} + // ── OpenAI-compatible types ─────────────────────────────────────────────────── struct ChatCompletionRequest: Decodable { @@ -326,6 +394,15 @@ struct ChatCompletionRequest: Decodable { let role: String let content: String } + struct ToolDef: Decodable { + let type: String + let function: ToolFuncDef + } + struct ToolFuncDef: Decodable { + let name: String + let description: String? + let parameters: [String: AnyCodable]? + } let model: String? let messages: [Message] let stream: Bool? @@ -333,9 +410,10 @@ struct ChatCompletionRequest: Decodable { let temperature: Double? let topP: Double? let repetitionPenalty: Double? + let tools: [ToolDef]? enum CodingKeys: String, CodingKey { - case model, messages, stream, temperature + case model, messages, stream, temperature, tools case maxTokens = "max_tokens" case topP = "top_p" case repetitionPenalty = "repetition_penalty" @@ -364,7 +442,45 @@ struct Choice: Encodable { struct AssistantMessage: Encodable { let role: String - let content: String + let content: String? + let toolCalls: [ToolCallResponse]? + + enum CodingKeys: String, CodingKey { + case role, content + case toolCalls = "tool_calls" + } +} + +struct ToolCallResponse: Encodable { + let id: String + let type: String + let function: ToolCallFunction +} + +struct ToolCallFunction: Encodable { + let name: String + let arguments: String +} + +/// AnyCodable: decode arbitrary JSON for tool parameters pass-through +struct AnyCodable: Decodable, Sendable { + let value: Any + init(from decoder: Decoder) throws { + let c = try decoder.singleValueContainer() + if c.decodeNil() { value = NSNull() } + else if let b = try? c.decode(Bool.self) { value = b } + else if let i = try? c.decode(Int.self) { value = i } + else if let d = try? c.decode(Double.self) { value = d } + else if let s = try? c.decode(String.self) { value = s } + else if let a = try? c.decode([AnyCodable].self) { value = a.map { $0.value } } + else if let d = try? c.decode([String: AnyCodable].self) { value = d.mapValues { $0.value } } + else { value = NSNull() } + } + // Convert back to [String: any Sendable] for ToolSpec usage + static func toSendable(_ dict: [String: AnyCodable]?) -> [String: any Sendable]? { + guard let dict else { return nil } + return dict.mapValues { $0.value as! any Sendable } + } } struct TokenUsage: Encodable {