diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/CCApplication.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/CCApplication.java index 6d17fef21c0..3ec3add2338 100644 --- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/CCApplication.java +++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/CCApplication.java @@ -41,6 +41,7 @@ import java.util.Set; import java.util.concurrent.ConcurrentMap; +import org.apache.asterix.api.http.IApiServerRegistrant; import org.apache.asterix.api.http.IQueryWebServerRegistrant; import org.apache.asterix.api.http.server.ActiveRequestsServlet; import org.apache.asterix.api.http.server.ActiveStatsApiServlet; @@ -361,6 +362,9 @@ protected HttpServer setupJSONAPIServer(ExternalProperties externalProperties) t addServlet(jsonAPIServer, Servlets.CLUSTER_STATE_CC_DETAIL); // must not precede add of CLUSTER_STATE addServlet(jsonAPIServer, Servlets.DIAGNOSTICS); addServlet(jsonAPIServer, Servlets.ACTIVE_STATS); + // Load extension servlets registered via ServiceLoader (e.g., NL2SQL++ from asterix-spidersilk) + ServiceLoader.load(IApiServerRegistrant.class) + .forEach(registrant -> registrant.register(appCtx, jsonAPIServer)); return jsonAPIServer; } diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/api/http/IApiServerRegistrant.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/api/http/IApiServerRegistrant.java new file mode 100644 index 00000000000..eec13e4ebe7 --- /dev/null +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/api/http/IApiServerRegistrant.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.api.http; + +/** + * Extension point for registering servlets on the JSON API server (default port 19002). + * Implementations are discovered via {@link java.util.ServiceLoader}. + * + * To register a servlet, create an implementation of this interface and declare it in: + * {@code META-INF/services/org.apache.asterix.api.http.IApiServerRegistrant} + * + * @see IQueryWebServerRegistrant for the equivalent mechanism on the query web server (port 19006) + */ +public interface IApiServerRegistrant extends IServletRegistrant { +} diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/utils/Servlets.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/utils/Servlets.java index 5edc18642d3..2e51bb8ec68 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/utils/Servlets.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/utils/Servlets.java @@ -23,6 +23,7 @@ public class Servlets { public static final String QUERY_STATUS = "/query/service/status/*"; public static final String QUERY_RESULT = "/query/service/result/*"; public static final String QUERY_SERVICE = "/query/service"; + public static final String NL2SQL_SERVICE = "/query/nl2sql"; public static final String CONNECTOR = "/connector"; public static final String REBALANCE = "/admin/rebalance"; public static final 
String SHUTDOWN = "/admin/shutdown"; diff --git a/asterixdb/asterix-spidersilk/pom.xml b/asterixdb/asterix-spidersilk/pom.xml index b41ba2d8985..e0ec2df1b77 100644 --- a/asterixdb/asterix-spidersilk/pom.xml +++ b/asterixdb/asterix-spidersilk/pom.xml @@ -21,12 +21,56 @@ asterix-spidersilk asterix-spidersilk + + ${basedir}/.. + + org.apache.asterix apache-asterixdb 0.9.10-SNAPSHOT + + + org.apache.asterix + asterix-common + ${project.version} + + + org.apache.asterix + asterix-metadata + ${project.version} + + + org.apache.asterix + asterix-om + ${project.version} + + + org.apache.hyracks + hyracks-http + + + io.netty + netty-codec-http + + + com.fasterxml.jackson.core + jackson-databind + + + org.apache.logging.log4j + log4j-api + + + + junit + junit + test + + + diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/INl2SqlTranslator.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/INl2SqlTranslator.java new file mode 100644 index 00000000000..3aa3e7e8ccd --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/INl2SqlTranslator.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.spidersilk.api; + +/** + * Core interface for natural language to SQL++ translation. + * + * Implementations are model-agnostic: any LLM backend (OpenAI, Ollama, etc.) + * can be used by providing a different implementation. The LangChain4j framework + * is used internally to manage LLM communication, prompt templating, and retries. + * + *

<p>Usage example:
+ * <pre>{@code
+ *   INl2SqlTranslator translator = new LangChain4jTranslator(config);
+ *   SchemaContext schema = schemaBuilder.buildContext("TinySocial");
+ *   String sqlpp = translator.translate("Find all tweets mentioning AsterixDB", schema);
+ *   // sqlpp => "SELECT VALUE t FROM TweetMessages t WHERE t.message_text LIKE '%AsterixDB%'"
+ * }</pre>
+ */ +public interface INl2SqlTranslator { + + /** + * Translates a natural language query into an executable SQL++ statement. + * + * The implementation should: + *
<ol>
+     *   <li>Build a schema-aware prompt from {@code schemaContext}</li>
+     *   <li>Call the configured LLM to generate a SQL++ candidate</li>
+     *   <li>Validate the candidate using the AsterixDB SQL++ parser</li>
+     *   <li>Retry with error feedback if validation fails (up to a configured max)</li>
+     * </ol>
+ * + * @param naturalLanguage the user's natural language query (non-null, non-empty) + * @param schemaContext schema information for the target dataverse; may be + * {@code null} if no dataverse is specified + * @return a syntactically valid SQL++ query string + * @throws Nl2SqlException if translation fails after exhausting retries, + * or if the LLM service is unavailable + */ + String translate(String naturalLanguage, SchemaContext schemaContext) throws Nl2SqlException; +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/Nl2SqlException.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/Nl2SqlException.java new file mode 100644 index 00000000000..a8993e7869e --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/Nl2SqlException.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.spidersilk.api; + +/** + * Thrown when natural language to SQL++ translation fails. + * Common causes: + *
<ul>
+ *   <li>LLM service unavailable or misconfigured</li>
+ *   <li>Generated SQL++ fails syntax validation after max retries</li>
+ *   <li>Input natural language query is ambiguous or unsupported</li>
+ * </ul>
+ */ +public class Nl2SqlException extends Exception { + + private static final long serialVersionUID = 1L; + + public Nl2SqlException(String message) { + super(message); + } + + public Nl2SqlException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/SchemaContext.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/SchemaContext.java new file mode 100644 index 00000000000..76fa0c41f70 --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/api/SchemaContext.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.spidersilk.api; + +import java.util.Collections; +import java.util.List; + +/** + * Encapsulates the database schema information extracted from AsterixDB metadata. + * This context is injected into the LLM prompt to enable schema-aware SQL++ generation. + * + * The schema is extracted from the {@code asterix-metadata} module via MetadataManager, + * including Dataset definitions, type information, and index metadata. 
+ */ +public class SchemaContext { + + private final String dataverse; + private final List datasetDescriptions; + + public SchemaContext(String dataverse, List datasetDescriptions) { + this.dataverse = dataverse; + this.datasetDescriptions = Collections.unmodifiableList(new java.util.ArrayList<>(datasetDescriptions)); + } + + /** + * @return the target dataverse name + */ + public String getDataverse() { + return dataverse; + } + + /** + * @return human-readable schema descriptions for each dataset in the dataverse, + * formatted for inclusion in an LLM prompt + */ + public List getDatasetDescriptions() { + return datasetDescriptions; + } + + /** + * Renders the schema context as a prompt-ready string. + * Example output: + *
<pre>
+     * Dataverse: TinySocial
+     * Dataset TweetMessages (tweetid: bigint, sender-location: point, text: string, ...)
+     * Dataset FacebookUsers (id: bigint, name: string, employment: [object], ...)
+     * </pre>
+ */ + public String toPromptString() { + StringBuilder sb = new StringBuilder(); + sb.append("Dataverse: ").append(dataverse).append('\n'); + for (String desc : datasetDescriptions) { + sb.append(desc).append('\n'); + } + return sb.toString(); + } + + @Override + public String toString() { + return "SchemaContext{dataverse='" + dataverse + "', datasets=" + datasetDescriptions.size() + "}"; + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/ColumnInfo.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/ColumnInfo.java new file mode 100644 index 00000000000..6bd39823bd8 --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/ColumnInfo.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.spidersilk.schema; + +/** + * Holds metadata for a single field (column) within a Dataset. + * + *

<p>Instances are created by {@link SchemaContextBuilder} while traversing the
+ * AsterixDB ADM type tree, and consumed by:
+ * <ul>
+ *   <li>{@link DatasetSchema#toDescriptionString()} — rendered into a prompt-ready string</li>
+ *   <li>ColumnPruner (PR-4) — scored for relevance and potentially filtered out</li>
+ * </ul>
+ */ +public class ColumnInfo { + + private final String name; + private final String type; + private final boolean primaryKey; + + /** + * @param name field name as declared in the Dataset's item type + * @param type human-readable type string produced by {@link DatasetSchemaFormatter} + * @param primaryKey {@code true} if this field is part of the Dataset's primary / partitioning key + */ + public ColumnInfo(String name, String type, boolean primaryKey) { + this.name = name; + this.type = type; + this.primaryKey = primaryKey; + } + + public String getName() { + return name; + } + + public String getType() { + return type; + } + + public boolean isPrimaryKey() { + return primaryKey; + } + + /** + * Returns a compact description suitable for inclusion in an LLM prompt. + * Example: {@code "tweetid: bigint [PK]"} or {@code "message-text: string"} + */ + public String toDescriptionString() { + return primaryKey ? name + ": " + type + " [PK]" : name + ": " + type; + } + + @Override + public String toString() { + return "ColumnInfo{name='" + name + "', type='" + type + "', primaryKey=" + primaryKey + '}'; + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchema.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchema.java new file mode 100644 index 00000000000..81eb8e511aa --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchema.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.asterix.spidersilk.schema; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Represents the schema of a single AsterixDB Dataset, including all field + * metadata extracted from its ADM item type. + * + *

<p>This object travels through the three-layer schema injection pipeline:
+ * <ol>
+ *   <li>Created by {@link SchemaContextBuilder} with the full column list</li>
+ *   <li>ColumnPruner (PR-4) calls {@link #setPrunedColumns} to store a filtered subset</li>
+ *   <li>ValueHintsSampler (PR-5) calls {@link #setValueHints} to attach sample values</li>
+ *   <li>{@link #toDescriptionString()} is called by PromptBuilder to produce the final
+ *       prompt fragment for this Dataset</li>
+ * </ol>
+ *
+ * <p>Example output of {@link #toDescriptionString()}:
+ * <pre>
+ * Dataset TweetMessages (tweetid: bigint [PK], sender-location: point,
+ *     send-time: datetime, referred-topics: [string], message-text: string, author-id: bigint)
+ * </pre>
+ */ +public class DatasetSchema { + + private final String datasetName; + private final List allColumns; + + /** Set by ColumnPruner; null until pruning is performed. */ + private List prunedColumns; + + /** Set by ValueHintsSampler; null until sampling is performed. Key = field name. */ + private Map> valueHints; + + public DatasetSchema(String datasetName, List allColumns) { + this.datasetName = datasetName; + this.allColumns = Collections.unmodifiableList(new java.util.ArrayList<>(allColumns)); + } + + public String getDatasetName() { + return datasetName; + } + + /** Returns the complete column list as extracted from the ADM item type. */ + public List getAllColumns() { + return allColumns; + } + + /** + * Returns the pruned column list if {@link #setPrunedColumns} has been called, + * otherwise falls back to the full list. + */ + public List getEffectiveColumns() { + return prunedColumns != null ? prunedColumns : allColumns; + } + + /** Called by ColumnPruner (PR-4) to store the relevance-filtered subset. */ + public void setPrunedColumns(List prunedColumns) { + this.prunedColumns = prunedColumns; + } + + /** Called by ValueHintsSampler (PR-5) to attach sample values per field. */ + public void setValueHints(Map> valueHints) { + this.valueHints = valueHints; + } + + public Map> getValueHints() { + return valueHints; + } + + /** + * Renders this Dataset's schema as a prompt-ready one-liner. + * Uses the pruned column list if available, otherwise the full column list. + * + *

<p>Example:
+     * <pre>Dataset TweetMessages (tweetid: bigint [PK], message-text: string, author-id: bigint)</pre>
+ */ + public String toDescriptionString() { + List columns = getEffectiveColumns(); + StringBuilder sb = new StringBuilder("Dataset ").append(datasetName).append(" ("); + for (int i = 0; i < columns.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(columns.get(i).toDescriptionString()); + } + sb.append(')'); + return sb.toString(); + } + + @Override + public String toString() { + return "DatasetSchema{name='" + datasetName + "', columns=" + allColumns.size() + '}'; + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchemaFormatter.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchemaFormatter.java new file mode 100644 index 00000000000..bed6e90ac7e --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/DatasetSchemaFormatter.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.spidersilk.schema; + +import org.apache.asterix.om.types.AOrderedListType; +import org.apache.asterix.om.types.ARecordType; +import org.apache.asterix.om.types.AUnionType; +import org.apache.asterix.om.types.AUnorderedListType; +import org.apache.asterix.om.types.IAType; + +/** + * Converts AsterixDB ADM type objects ({@link IAType}) into human-readable strings + * suitable for inclusion in an LLM prompt. + * + *

<p>Type rendering rules:
+ * <ul>
+ *   <li>Primitive types (bigint, string, boolean, …) → lower-cased type name</li>
+ *   <li>{@code ARecordType} (nested object) → <code>{field1: type1, field2: type2}</code></li>
+ *   <li>{@code AOrderedListType} (ordered array) → {@code [itemType]}</li>
+ *   <li>{@code AUnorderedListType} (bag/multiset) → <code>{{itemType}}</code></li>
+ *   <li>{@code AUnionType} (nullable/missable field) → {@code actualType?}</li>
+ * </ul>
+ *
+ * <p>
Recursive formatting is limited to {@value #MAX_DEPTH} levels to prevent + * runaway output for deeply nested types. + */ +public class DatasetSchemaFormatter { + + private static final int MAX_DEPTH = 4; + + /** + * Formats {@code type} as a human-readable string. + * + * @param type the ADM type to format; {@code null} is rendered as {@code "any"} + * @return a compact, prompt-friendly type description + */ + public String formatType(IAType type) { + return formatType(type, 0); + } + + private String formatType(IAType type, int depth) { + if (type == null) { + return "any"; + } + if (depth >= MAX_DEPTH) { + return "object"; + } + switch (type.getTypeTag()) { + case UNION: + // Nullable or missable field: unwrap to the actual type and append '?' + return formatType(((AUnionType) type).getActualType(), depth) + "?"; + case OBJECT: + return formatRecord((ARecordType) type, depth); + case ARRAY: + // Ordered list (SQL++ array syntax: [itemType]) + return "[" + formatType(((AOrderedListType) type).getItemType(), depth + 1) + "]"; + case MULTISET: + // Unordered list / bag (SQL++ multiset syntax: {{itemType}}) + return "{{" + formatType(((AUnorderedListType) type).getItemType(), depth + 1) + "}}"; + default: + return type.getTypeName().toLowerCase(); + } + } + + /** + * Formats a record type as {@code {field1: type1, field2: type2}}. + * For top-level fields of a Dataset (depth 0), the outer braces are omitted + * because the field list is already wrapped by the Dataset description. 
+ */ + private String formatRecord(ARecordType recordType, int depth) { + String[] fieldNames = recordType.getFieldNames(); + IAType[] fieldTypes = recordType.getFieldTypes(); + if (fieldNames.length == 0) { + return "object"; + } + StringBuilder sb = new StringBuilder(); + boolean wrapWithBraces = depth > 0; + if (wrapWithBraces) { + sb.append('{'); + } + for (int i = 0; i < fieldNames.length; i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(fieldNames[i]).append(": ").append(formatType(fieldTypes[i], depth + 1)); + } + if (wrapWithBraces) { + sb.append('}'); + } + return sb.toString(); + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilder.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilder.java new file mode 100644 index 00000000000..0f0ab4dc3fe --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilder.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.asterix.spidersilk.schema; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.asterix.common.metadata.DataverseName; +import org.apache.asterix.common.metadata.MetadataConstants; +import org.apache.asterix.metadata.MetadataManager; +import org.apache.asterix.metadata.MetadataTransactionContext; +import org.apache.asterix.metadata.entities.Dataset; +import org.apache.asterix.om.types.ARecordType; +import org.apache.asterix.om.types.IAType; +import org.apache.asterix.spidersilk.api.Nl2SqlException; +import org.apache.asterix.spidersilk.api.SchemaContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +/** + * Builds a {@link SchemaContext} by reading Dataset and type metadata from + * AsterixDB's {@link MetadataManager}. + * + *

<p>For each Dataset in the target Dataverse, this class:
+ * <ol>
+ *   <li>Fetches the ADM item type via {@code MetadataManager.INSTANCE.getDatatype()}</li>
+ *   <li>Recursively formats the type tree into a human-readable field list using
+ *       {@link DatasetSchemaFormatter}</li>
+ *   <li>Marks primary-key fields from {@code Dataset.getPrimaryKeys()}</li>
+ *   <li>Returns a {@link SchemaContext} whose {@code datasetDescriptions} list is
+ *       ready to be consumed by {@link SchemaEmbeddingService} (PR-3)</li>
+ * </ol>
+ *
+ * <p>All metadata reads are wrapped in a single metadata transaction that is
+ * committed on success and aborted on any failure, following the standard
+ * AsterixDB metadata access pattern.
+ *
+ * <p>Example output description for one Dataset:
+ * <pre>
+ * Dataset TweetMessages (tweetid: bigint [PK], sender-location: point,
+ *     send-time: datetime, referred-topics: [string], message-text: string, author-id: bigint)
+ * </pre>
+ */ +public class SchemaContextBuilder { + + private static final Logger LOGGER = LogManager.getLogger(); + + private final DatasetSchemaFormatter formatter; + private final String databaseName; + + /** + * Creates a builder that reads from the default AsterixDB database + * ({@code MetadataConstants.DEFAULT_DATABASE}). + */ + public SchemaContextBuilder() { + this(MetadataConstants.DEFAULT_DATABASE); + } + + /** + * Creates a builder that reads from the specified database. + * + * @param databaseName the AsterixDB database name (typically {@code "Default"}) + */ + public SchemaContextBuilder(String databaseName) { + this.databaseName = databaseName; + this.formatter = new DatasetSchemaFormatter(); + } + + /** + * Builds a {@link SchemaContext} containing descriptions for all Datasets + * in the given Dataverse. + * + * @param dataverse the target Dataverse name (e.g. {@code "TinySocial"}) + * @return a populated {@link SchemaContext} ready for embedding and prompt injection + * @throws Nl2SqlException if the Dataverse does not exist or metadata access fails + */ + public SchemaContext build(String dataverse) throws Nl2SqlException { + MetadataTransactionContext txnCtx = null; + try { + DataverseName dataverseName = DataverseName.createSinglePartName(dataverse); + txnCtx = MetadataManager.INSTANCE.beginTransaction(); + + List datasets = MetadataManager.INSTANCE.getDataverseDatasets(txnCtx, databaseName, dataverseName); + + if (datasets.isEmpty()) { + LOGGER.warn("No datasets found in dataverse '{}'", dataverse); + } + + List descriptions = new ArrayList<>(datasets.size()); + for (Dataset dataset : datasets) { + try { + DatasetSchema schema = buildDatasetSchema(txnCtx, dataset); + descriptions.add(schema.toDescriptionString()); + } catch (Exception e) { + // Skip datasets whose type cannot be resolved rather than failing the whole request + LOGGER.warn("Skipping dataset '{}': failed to resolve type — {}", dataset.getDatasetName(), + e.getMessage()); + } + } + + 
MetadataManager.INSTANCE.commitTransaction(txnCtx); + txnCtx = null; + return new SchemaContext(dataverse, descriptions); + + } catch (Exception e) { + throw new Nl2SqlException("Failed to build schema context for dataverse: " + dataverse, e); + } finally { + if (txnCtx != null) { + try { + MetadataManager.INSTANCE.abortTransaction(txnCtx); + } catch (Exception ignored) { + LOGGER.warn("Failed to abort metadata transaction", ignored); + } + } + } + } + + /** + * Builds a {@link DatasetSchema} for a single Dataset by resolving its item type + * and extracting field metadata. + */ + private DatasetSchema buildDatasetSchema(MetadataTransactionContext txnCtx, Dataset dataset) throws Exception { + // Collect primary-key field names for annotation + Set primaryKeyFields = extractPrimaryKeyFields(dataset); + + // Resolve the ADM item type + IAType itemType = MetadataManager.INSTANCE.getDatatype(txnCtx, dataset.getItemTypeDatabaseName(), + dataset.getItemTypeDataverseName(), dataset.getItemTypeName()).getDatatype(); + + List columns = new ArrayList<>(); + if (itemType instanceof ARecordType) { + ARecordType recordType = (ARecordType) itemType; + String[] fieldNames = recordType.getFieldNames(); + IAType[] fieldTypes = recordType.getFieldTypes(); + for (int i = 0; i < fieldNames.length; i++) { + String typeStr = formatter.formatType(fieldTypes[i]); + boolean isPk = primaryKeyFields.contains(fieldNames[i]); + columns.add(new ColumnInfo(fieldNames[i], typeStr, isPk)); + } + } else { + LOGGER.debug("Dataset '{}' has non-record item type: {}", dataset.getDatasetName(), itemType.getTypeTag()); + } + + return new DatasetSchema(dataset.getDatasetName(), columns); + } + + /** + * Returns the set of top-level field names that form the primary key. + * For composite keys only the first part of each key path is collected, + * which is sufficient for annotation purposes in prompt generation. 
+ */ + private Set extractPrimaryKeyFields(Dataset dataset) { + Set pkFields = new HashSet<>(); + List> primaryKeys = dataset.getPrimaryKeys(); + for (List keyPath : primaryKeys) { + if (!keyPath.isEmpty()) { + pkFields.add(keyPath.get(0)); + } + } + return pkFields; + } +} diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServlet.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServlet.java new file mode 100644 index 00000000000..6e748c3de22 --- /dev/null +++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServlet.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.apache.asterix.spidersilk.servlet;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.List;
+import java.util.concurrent.ConcurrentMap;
+
+import org.apache.asterix.spidersilk.api.INl2SqlTranslator;
+import org.apache.asterix.spidersilk.api.Nl2SqlException;
+import org.apache.asterix.spidersilk.api.SchemaContext;
+import org.apache.hyracks.http.api.IServletRequest;
+import org.apache.hyracks.http.api.IServletResponse;
+import org.apache.hyracks.http.server.AbstractServlet;
+import org.apache.hyracks.http.server.utils.HttpUtil;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import io.netty.handler.codec.http.HttpResponseStatus;
+
+/**
+ * HTTP servlet exposing the NL2SQL++ translation API on the JSON API server.
+ * <p>
+ * Endpoint: {@code POST /query/nl2sql}
+ * <p>
+ * Request parameters (form or JSON body):
+ * <ul>
+ *   <li>{@code statement} (required) &mdash; the natural language query</li>
+ *   <li>{@code dataverse} (optional) &mdash; target dataverse for schema context</li>
+ * </ul>
+ * <p>
+ * Response (JSON):
+ * <pre>
+ * {
+ *   "sqlpp":   "SELECT VALUE t FROM TweetMessages t WHERE ...",
+ *   "status":  "success"
+ * }
+ * </pre>
+ * <p>
+ * When the {@code INl2SqlTranslator} implementation is not yet available,
+ * the endpoint returns HTTP 501 (Not Implemented) with an informative message,
+ * allowing the servlet to be registered and tested without a live LLM backend.
+ */
+public class NL2SqlServlet extends AbstractServlet {
+
+    private static final Logger LOGGER = LogManager.getLogger();
+    protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /** Request parameter name for the natural language query. */
+    public static final String PARAM_STATEMENT = "statement";
+    /** Optional request parameter specifying the target dataverse. */
+    public static final String PARAM_DATAVERSE = "dataverse";
+
+    /**
+     * The translator is injected at construction time and may be {@code null}
+     * until a concrete LLM implementation is provided (Phase 2 of development).
+     */
+    private final INl2SqlTranslator translator;
+
+    public NL2SqlServlet(ConcurrentMap<String, Object> ctx, String[] paths, INl2SqlTranslator translator) {
+        super(ctx, paths);
+        this.translator = translator;
+    }
+
+    @Override
+    protected void post(IServletRequest request, IServletResponse response) throws IOException {
+        String naturalLanguage = request.getParameter(PARAM_STATEMENT);
+        String dataverse = request.getParameter(PARAM_DATAVERSE);
+
+        if (naturalLanguage == null || naturalLanguage.isBlank()) {
+            sendError(request, response, HttpResponseStatus.BAD_REQUEST, "Parameter 'statement' is required.");
+            return;
+        }
+
+        if (translator == null) {
+            sendError(request, response, HttpResponseStatus.NOT_IMPLEMENTED,
+                    "NL2SQL++ translator is not yet configured. "
+                            + "Set nl2sql.model.type and related properties in cc.conf and restart the server.");
+            return;
+        }
+
+        try {
+            // Build schema context from metadata if a dataverse is provided.
+            // SchemaContextBuilder integration will be added in the next phase.
+            SchemaContext schemaContext = dataverse != null ? new SchemaContext(dataverse, List.of()) : null;
+
+            String sqlpp = translator.translate(naturalLanguage, schemaContext);
+
+            HttpUtil.setContentType(response, HttpUtil.ContentType.APPLICATION_JSON, request);
+            response.setStatus(HttpResponseStatus.OK);
+
+            ObjectNode result = OBJECT_MAPPER.createObjectNode();
+            result.put("sqlpp", sqlpp);
+            result.put("status", "success");
+
+            PrintWriter writer = response.writer();
+            writer.write(result.toString());
+            writer.flush();
+        } catch (Nl2SqlException e) {
+            LOGGER.warn("NL2SQL translation failed for query: {}", naturalLanguage, e);
+            sendError(request, response, HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
+        }
+    }
+
+    @Override
+    protected void get(IServletRequest request, IServletResponse response) throws IOException {
+        sendError(request, response, HttpResponseStatus.METHOD_NOT_ALLOWED, "Use POST with parameter 'statement'.");
+    }
+
+    /** Writes a JSON error body {@code {"status":"error","message":...}} with the given HTTP status. */
+    private void sendError(IServletRequest request, IServletResponse response, HttpResponseStatus status,
+            String message) throws IOException {
+        HttpUtil.setContentType(response, HttpUtil.ContentType.APPLICATION_JSON, request);
+        response.setStatus(status);
+        ObjectNode error = OBJECT_MAPPER.createObjectNode();
+        error.put("status", "error");
+        error.put("message", message);
+        PrintWriter writer = response.writer();
+        writer.write(error.toString());
+        writer.flush();
+    }
+}
diff --git a/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServletRegistrant.java b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServletRegistrant.java
new file mode 100644
index 00000000000..9f0e1a74559
--- /dev/null
+++ b/asterixdb/asterix-spidersilk/src/main/java/org/apache/asterix/spidersilk/servlet/NL2SqlServletRegistrant.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.spidersilk.servlet;
+
+import org.apache.asterix.api.http.IApiServerRegistrant;
+import org.apache.asterix.common.dataflow.ICcApplicationContext;
+import org.apache.asterix.common.utils.Servlets;
+import org.apache.hyracks.http.server.HttpServer;
+
+/**
+ * Registers the {@link NL2SqlServlet} on the JSON API server via the
+ * {@link IApiServerRegistrant} ServiceLoader extension point.
+ * <p>
+ * This class is discovered automatically at runtime through:
+ * {@code META-INF/services/org.apache.asterix.api.http.IApiServerRegistrant}
+ * <p>
+ * No modification to {@code CCApplication.java} is required beyond the
+ * one-time addition of the ServiceLoader call in {@code setupJSONAPIServer()}.
+ */
+public class NL2SqlServletRegistrant implements IApiServerRegistrant {
+
+    @Override
+    public void register(ICcApplicationContext appCtx, HttpServer apiServer) {
+        // The translator is null here; it will be initialized from configuration
+        // in a follow-up phase when LangChain4j integration is added.
+        String[] paths = new String[] { Servlets.NL2SQL_SERVICE };
+        NL2SqlServlet servlet = new NL2SqlServlet(apiServer.ctx(), paths, null);
+        apiServer.addServlet(servlet);
+    }
+}
diff --git a/asterixdb/asterix-spidersilk/src/main/resources/META-INF/services/org.apache.asterix.api.http.IApiServerRegistrant b/asterixdb/asterix-spidersilk/src/main/resources/META-INF/services/org.apache.asterix.api.http.IApiServerRegistrant
new file mode 100644
index 00000000000..0a4c6a71fde
--- /dev/null
+++ b/asterixdb/asterix-spidersilk/src/main/resources/META-INF/services/org.apache.asterix.api.http.IApiServerRegistrant
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+org.apache.asterix.spidersilk.servlet.NL2SqlServletRegistrant
diff --git a/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/NL2SqlServletTest.java b/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/NL2SqlServletTest.java
new file mode 100644
index 00000000000..9ee95c7cedd
--- /dev/null
+++ b/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/NL2SqlServletTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.spidersilk;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.asterix.spidersilk.api.INl2SqlTranslator;
+import org.apache.asterix.spidersilk.api.Nl2SqlException;
+import org.apache.asterix.spidersilk.api.SchemaContext;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for the NL2SQL++ module skeleton.
+ *
+ * These tests verify the core API contracts without requiring a running AsterixDB
+ * instance or a live LLM service. Full integration tests will be added in Phase 2
+ * when LangChain4j translation is implemented.
+ */
+public class NL2SqlServletTest {
+
+    @Test
+    public void testSchemaContextToPromptString() {
+        SchemaContext ctx =
+                new SchemaContext("TinySocial", Arrays.asList("Dataset TweetMessages (tweetid: bigint, text: string)",
+                        "Dataset FacebookUsers (id: bigint, name: string)"));
+
+        String prompt = ctx.toPromptString();
+
+        Assert.assertTrue("Prompt should contain dataverse name", prompt.contains("TinySocial"));
+        Assert.assertTrue("Prompt should contain TweetMessages dataset", prompt.contains("TweetMessages"));
+        Assert.assertTrue("Prompt should contain FacebookUsers dataset", prompt.contains("FacebookUsers"));
+    }
+
+    @Test
+    public void testSchemaContextImmutable() {
+        List<String> descriptions = new ArrayList<>();
+        descriptions.add("Dataset Foo (id: bigint)");
+        SchemaContext ctx = new SchemaContext("TestDV", descriptions);
+
+        // Modifying the original list should not affect the SchemaContext
+        descriptions.add("Dataset Bar (id: bigint)");
+
+        Assert.assertEquals("SchemaContext should hold an immutable copy of the descriptions", 1,
+                ctx.getDatasetDescriptions().size());
+    }
+
+    @Test
+    public void testNl2SqlExceptionMessage() {
+        Nl2SqlException ex = new Nl2SqlException("LLM service unavailable");
+        Assert.assertEquals("LLM service unavailable", ex.getMessage());
+    }
+
+    @Test
+    public void testNl2SqlExceptionWithCause() {
+        RuntimeException cause = new RuntimeException("connection refused");
+        Nl2SqlException ex = new Nl2SqlException("Translation failed", cause);
+
+        Assert.assertEquals("Translation failed", ex.getMessage());
+        Assert.assertSame(cause, ex.getCause());
+    }
+
+    /**
+     * Verifies that a mock implementation of INl2SqlTranslator correctly
+     * returns a SQL++ string. This ensures the interface contract is stable.
+     */
+    @Test
+    public void testTranslatorInterfaceContract() throws Nl2SqlException {
+        INl2SqlTranslator mockTranslator =
+                (nl, schema) -> "SELECT VALUE t FROM TweetMessages t WHERE t.text LIKE '%" + nl + "%'";
+
+        SchemaContext ctx = new SchemaContext("TinySocial",
+                Collections.singletonList("Dataset TweetMessages (tweetid: bigint, text: string)"));
+
+        String result = mockTranslator.translate("AsterixDB", ctx);
+
+        Assert.assertNotNull("Translator must return a non-null SQL++ string", result);
+        Assert.assertTrue("Result should reference the dataset", result.contains("TweetMessages"));
+        Assert.assertTrue("Result should be a SELECT statement", result.startsWith("SELECT"));
+    }
+}
diff --git a/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilderTest.java b/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilderTest.java
new file mode 100644
index 00000000000..5bbf76743fa
--- /dev/null
+++ b/asterixdb/asterix-spidersilk/src/test/java/org/apache/asterix/spidersilk/schema/SchemaContextBuilderTest.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.spidersilk.schema;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.ARecordType;
+import org.apache.asterix.om.types.AUnionType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.IAType;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for the PR-2 schema extraction components.
+ *
+ * These tests exercise {@link DatasetSchemaFormatter} and {@link DatasetSchema}
+ * using ADM type objects constructed directly in-memory, with no dependency on
+ * a running AsterixDB instance or MetadataManager.
+ *
+ * Integration tests that verify the full {@link SchemaContextBuilder#build(String)}
+ * path against a live AsterixDB + TinySocial dataset are left for the integration
+ * test suite (require a running cluster).
+ */
+public class SchemaContextBuilderTest {
+
+    private final DatasetSchemaFormatter formatter = new DatasetSchemaFormatter();
+
+    // -------------------------------------------------------------------------
+    // DatasetSchemaFormatter tests
+    // -------------------------------------------------------------------------
+
+    @Test
+    public void testFormatPrimitiveTypes() {
+        Assert.assertEquals("int64", formatter.formatType(BuiltinType.AINT64));
+        Assert.assertEquals("string", formatter.formatType(BuiltinType.ASTRING));
+        Assert.assertEquals("boolean", formatter.formatType(BuiltinType.ABOOLEAN));
+        Assert.assertEquals("double", formatter.formatType(BuiltinType.ADOUBLE));
+    }
+
+    @Test
+    public void testFormatNullType() {
+        Assert.assertEquals("any", formatter.formatType(null));
+    }
+
+    @Test
+    public void testFormatOrderedList() {
+        // [string] — ordered list of strings (SQL++ array)
+        AOrderedListType listType = new AOrderedListType(BuiltinType.ASTRING, "string-list");
+        String result = formatter.formatType(listType);
+        Assert.assertEquals("[string]", result);
+    }
+
+    @Test
+    public void testFormatNullableField() {
+        // string? — union of string + missing (nullable field)
+        AUnionType unionType =
+                new AUnionType(Arrays.asList(BuiltinType.ASTRING, BuiltinType.AMISSING), "nullable-string");
+        String result = formatter.formatType(unionType);
+        Assert.assertEquals("string?", result);
+    }
+
+    @Test
+    public void testFormatNestedRecord() {
+        // Nested record: { street: string, city: string }
+        ARecordType addressType = new ARecordType("AddressType", new String[] { "street", "city" },
+                new IAType[] { BuiltinType.ASTRING, BuiltinType.ASTRING }, false);
+
+        // Top-level record with a nested field
+        ARecordType personType = new ARecordType("PersonType", new String[] { "name", "address" },
+                new IAType[] { BuiltinType.ASTRING, addressType }, false);
+
+        // formatType on the top-level record (depth=0) should not wrap in braces
+        String result = formatter.formatType(personType);
+        Assert.assertTrue("Should contain nested field 'address'", result.contains("address"));
+        Assert.assertTrue("Should contain nested field 'street'", result.contains("street"));
+        Assert.assertTrue("Should contain nested field 'city'", result.contains("city"));
+    }
+
+    @Test
+    public void testFormatTweetMessagesSchema() {
+        // Mimics the TinySocial TweetMessages item type
+        AOrderedListType topicsType = new AOrderedListType(BuiltinType.ASTRING, "topics-list");
+        ARecordType tweetType = new ARecordType("TweetMessageType",
+                new String[] { "tweetid", "sender-location", "send-time", "referred-topics", "message-text",
+                        "author-id" },
+                new IAType[] { BuiltinType.AINT64, BuiltinType.ANY, BuiltinType.ADATETIME, topicsType,
+                        BuiltinType.ASTRING, BuiltinType.AINT64 },
+                false);
+
+        String result = formatter.formatType(tweetType);
+        Assert.assertTrue(result.contains("tweetid"));
+        Assert.assertTrue(result.contains("int64"));
+        Assert.assertTrue(result.contains("message-text"));
+        Assert.assertTrue(result.contains("referred-topics"));
+        Assert.assertTrue(result.contains("[string]"));
+    }
+
+    // -------------------------------------------------------------------------
+    // ColumnInfo tests
+    // -------------------------------------------------------------------------
+
+    @Test
+    public void testColumnInfoPrimaryKeyDescription() {
+        ColumnInfo pk = new ColumnInfo("tweetid", "bigint", true);
+        Assert.assertEquals("tweetid: bigint [PK]", pk.toDescriptionString());
+    }
+
+    @Test
+    public void testColumnInfoNonPrimaryKeyDescription() {
+        ColumnInfo col = new ColumnInfo("message-text", "string", false);
+        Assert.assertEquals("message-text: string", col.toDescriptionString());
+    }
+
+    // -------------------------------------------------------------------------
+    // DatasetSchema tests
+    // -------------------------------------------------------------------------
+
+    @Test
+    public void testDatasetSchemaDescriptionString() {
+        List<ColumnInfo> columns = Arrays.asList(new ColumnInfo("tweetid", "int64", true),
+                new ColumnInfo("message-text", "string", false), new ColumnInfo("author-id", "int64", false));
+
+        DatasetSchema schema = new DatasetSchema("TweetMessages", columns);
+        String desc = schema.toDescriptionString();
+
+        Assert.assertTrue(desc.startsWith("Dataset TweetMessages ("));
+        Assert.assertTrue(desc.contains("tweetid: int64 [PK]"));
+        Assert.assertTrue(desc.contains("message-text: string"));
+        Assert.assertTrue(desc.endsWith(")"));
+    }
+
+    @Test
+    public void testDatasetSchemaFallsBackToAllColumnsBeforePruning() {
+        List<ColumnInfo> columns =
+                Arrays.asList(new ColumnInfo("id", "bigint", true), new ColumnInfo("name", "string", false));
+
+        DatasetSchema schema = new DatasetSchema("Users", columns);
+
+        // Before pruning, getEffectiveColumns() returns the full list
+        Assert.assertEquals(2, schema.getEffectiveColumns().size());
+    }
+
+    @Test
+    public void testDatasetSchemaUsesPrunedColumnsAfterPruning() {
+        List<ColumnInfo> allColumns = Arrays.asList(new ColumnInfo("id", "bigint", true),
+                new ColumnInfo("name", "string", false), new ColumnInfo("created-at", "datetime", false));
+
+        DatasetSchema schema = new DatasetSchema("Users", allColumns);
+
+        // Simulate ColumnPruner keeping only id and name
+        List<ColumnInfo> pruned =
+                Arrays.asList(new ColumnInfo("id", "bigint", true), new ColumnInfo("name", "string", false));
+        schema.setPrunedColumns(pruned);
+
+        Assert.assertEquals(2, schema.getEffectiveColumns().size());
+        String desc = schema.toDescriptionString();
+        Assert.assertFalse("Pruned field should not appear", desc.contains("created-at"));
+    }
+
+    @Test
+    public void testDatasetSchemaImmutableAllColumns() {
+        List<ColumnInfo> mutable = new ArrayList<>();
+        mutable.add(new ColumnInfo("id", "bigint", true));
+        DatasetSchema schema = new DatasetSchema("Foo", mutable);
+
+        // Modifying original list must not affect the schema
+        mutable.add(new ColumnInfo("extra", "string", false));
+        Assert.assertEquals(1, schema.getAllColumns().size());
+    }
+
+    @Test
+    public void testEmptyDatasetDescription() {
+        DatasetSchema schema = new DatasetSchema("EmptyDataset", Collections.emptyList());
+        String desc = schema.toDescriptionString();
+        Assert.assertEquals("Dataset EmptyDataset ()", desc);
+    }
+}