Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

import { LexicalModelTypes } from '@keymanapp/common-types';

import { ContextToken } from './context-token.js';
import { ContextTokenization } from './context-tokenization.js';
import { SearchQuotientCluster } from './search-quotient-cluster.js';
import { legacySubsetKeyer, TokenizationSubset, TokenizationSubsetBuilder } from './tokenization-subsets.js';

import Distribution = LexicalModelTypes.Distribution;
Expand Down Expand Up @@ -80,11 +82,9 @@ export function precomputeTransitions(
/**
* Given results from `precomputeTransitions`, this function generates the
* context variations that should result from the current context variants and
 * input.
 * @param precomputedTransitionSubsets
 * @param transformDistribution
* @returns
*/
export function transitionTokenizations(
Expand Down Expand Up @@ -124,10 +124,61 @@ export function transitionTokenizations(
return;
}

// Of particular note - there may no longer be a specific, single set of edits
// for the transition; there will be different paths to reach a tokenization!
throw new Error("Multi-tokenization transitions not yet supported.");
finalTokenizations.set(key, mergeAlignedTokenizations(independentTransitionResults));
});

return finalTokenizations;
}

/**
* Given two or more instances of ContextTokenization, this function will
* attempt to merge each token's SearchQuotientNodes as necessary to result in a
* single instance.
*
 * An error will be thrown (from within the SearchQuotientCluster constructor)
 * if the instances do not sufficiently converge to the same tokenization
 * pattern - that is, if the clustered nodes disagree on input count, codepoint
 * length, or source-range identifiers.
Comment on lines +138 to +139
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not seeing any error?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See SearchQuotientCluster's constructor. This comment is regarding the error that will be generated within that constructor for such cases.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if(path.inputCount != inputCount || path.codepointLength != codepointLength) {
throw new Error(`SearchQuotientNode does not share same properties as others in the cluster: inputCount ${path.inputCount} vs ${inputCount}, codepointLength ${path.codepointLength} vs ${codepointLength}`);
}
// If there's a source-range key mismatch - via mismatch in count or in actual ID, we have an error.
if(path.sourceRangeKey != sourceRangeKey) {
throw new Error(`SearchQuotientNode does not share the same source identifiers as others in the cluster`);
}

Comment on lines +138 to +139
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not seeing any error thrown in this function

* @param tokenizations
* @returns
*/
export function mergeAlignedTokenizations(tokenizations: ContextTokenization[]): ContextTokenization {
const finalizedTokens: ContextToken[] = [];

// Iterate through the token indices as long as at least one tokenization
// remains with that index.
//
// Assumption: all have at least one token.
for(
let i = 0;
tokenizations.length > 0;
// Two co-related iteration steps. Could be expressed as just one with ++i in functor conditional.
i++, tokenizations = tokenizations.filter((tokenization) => tokenization.tokens.length > i)
) {
const bucket: ContextToken[] = [];

tokenizations.forEach((tokenization) => {
// Check for matches already within the bucket.
const token = tokenization.tokens[i];

if(!bucket.find((t) => t.searchModule.isSameNode(token.searchModule))) {
bucket.push(token);
}
});

if(bucket.length == 1) {
finalizedTokens.push(bucket[0]);
} else {
const constituentSpurs = bucket.flatMap((token) => {
const quotientNode = token.searchModule;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really understand all the terminology here: searchModule doesn't seem to have anything to do with quotientNode?

Even this function refers to a lot of concepts which I am struggling to put together into a coherent model: tokens (vs tokenizations), buckets, nodes, spurs, [search?]modules, clusters, quotients.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I probably should do a quality pass for consistency on many of the properties, class names, and such once all is said and done. For one, some things were put in place before #15161 was developed and put in place. I haven't enforced the new terms in all of the pre-existing code.

(In regard to this specific case: quotient graphs are graphs based on "modules" of a more detailed graph, to visualize the higher-level patterns found within.)

Not aiming to dismiss your concerns here whatsoever; it's been a journey working out the terms as work proceeds, and I hope there will be sufficient time to "polish things up" in this regard before release.


if(quotientNode instanceof SearchQuotientCluster) {
return quotientNode.parents;
} else {
return quotientNode;
}
})
finalizedTokens.push(new ContextToken(new SearchQuotientCluster(constituentSpurs)));
}
}

return new ContextTokenization(finalizedTokens);
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ import {
generateSubsetId,
LegacyQuotientRoot,
LegacyQuotientSpur,
mergeAlignedTokenizations,
models,
PathInputProperties,
precomputationSubsetKeyer,
precomputeTransitions,
SearchQuotientCluster,
TokenizationSubset,
TokenizationTransitionEdits,
TransitionEdge,
Expand Down Expand Up @@ -276,21 +278,29 @@ function generateFixturesForTransitionSource() {
);

const distrib2Subsets: Map<string, TokenizationSubset> = new Map();
const distrib2Tokenizations: Map<string, ContextTokenization> = new Map();
transitionSet2.flat().forEach((tuple) => {
const oldEntry = distrib2Subsets.get(tuple.key);
const key = tuple.key;
const oldEntry = distrib2Subsets.get(key);
if(!oldEntry) {
distrib2Subsets.set(tuple.key, tuple.subset)
distrib2Subsets.set(key, tuple.subset)
distrib2Tokenizations.set(key, tuple.transitionedTokenization);
} else {
// merge tokenizations
const combinedSubset: TokenizationSubset = {
key: oldEntry.key,
transitionEdges: new Map(oldEntry.transitionEdges)
};
const oldTokenization = distrib2Tokenizations.get(key);

tuple.subset.transitionEdges.forEach((value, key) => {
combinedSubset.transitionEdges.set(key, value);
});

distrib2Subsets.set(tuple.key, combinedSubset);
const mergedTokenization = mergeAlignedTokenizations([oldTokenization, tuple.transitionedTokenization]);

distrib2Subsets.set(key, combinedSubset);
distrib2Tokenizations.set(key, mergedTokenization);
}
});

Expand All @@ -309,7 +319,7 @@ function generateFixturesForTransitionSource() {
transitionSet: transitionSet2,
subsets: distrib2Subsets,
contextKey: transitionSet2[0][0].key,
tokenizations: null as Map<string, ContextTokenization>
tokenizations: distrib2Tokenizations
}
}
}
Expand Down Expand Up @@ -461,12 +471,29 @@ describe('transitionTokenizations', () => {
}
});

// Issue: we don't yet have converging tokenizations implemented. That's kind of a prerequisite here.
it.skip('handles the "cafe" fixture\'s second input correctly', () => {
// Verifies the second "cafe" input: the transition should yield three
// tokenizations, one of which results from merging two source tokenizations
// that converge on the same target (and thus carries a cluster at its tail).
it('handles the "cafe" fixture\'s second input correctly', () => {
const { cafe } = generateFixturesForTransitionSource();

const resultTokenizationMap = transitionTokenizations(cafe.key2.subsets, cafe.key2.dist);
const resultTokenizations = [...resultTokenizationMap.values()];

// Both source tokenizations have one spur each that converges to the same target tokenization.
assert.equal(resultTokenizations.length, 3);

// Partition the results by tail type: a SearchQuotientCluster tail marks a
// tokenization produced by merging; a plain node tail marks a simple
// single-path transition.
const simpleTransitionTokenizations = resultTokenizations.filter((t) => !(t.tail.searchModule instanceof SearchQuotientCluster));
const complexTransitionTokenizations = resultTokenizations.filter((t) => t.tail.searchModule instanceof SearchQuotientCluster);

assert.equal(resultTokenizationMap.size, 3);
assert.equal(simpleTransitionTokenizations.length, 2);
assert.equal(complexTransitionTokenizations.length, 1);


// Every result must match the expected tokenization precomputed (under the
// same key) by the fixture generator.
for(let key of resultTokenizationMap.keys()) {
const msg = `Invalid assumption for tokenization with key ${key}`;

const actual = resultTokenizationMap.get(key);
const expected = cafe.key2.tokenizations.get(key);

assertMatchingTokenization(actual, expected, msg);
}
});
});
Loading