Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .github/workflows/build-and-publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ on:
jobs:
build-and-publish:
name: Java Gradle
uses: bakdata/ci-templates/.github/workflows/java-gradle-library.yaml@1.75.0
with:
java-version: 17
uses: bakdata/ci-templates/.github/workflows/java-gradle-library.yaml@1.81.2
secrets:
sonar-token: ${{ secrets.SONARCLOUD_TOKEN }}
sonar-organization: ${{ secrets.SONARCLOUD_ORGANIZATION }}
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ on:
jobs:
java-gradle-release:
name: Java Gradle
uses: bakdata/ci-templates/.github/workflows/java-gradle-release.yaml@1.75.0
uses: bakdata/ci-templates/.github/workflows/java-gradle-release.yaml@1.81.2
with:
java-version: 17
release-type: "${{ inputs.release-type }}"
secrets:
github-email: "${{ secrets.GH_EMAIL }}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,23 @@ public class ClusterIdGenerators {
/**
* Returns an id generator that generates ints starting from 0.
*/
public static <T> Function<? super Iterable<? extends T>, Integer> intGenerator() {
public static <T> Function<Iterable<T>, Integer> intGenerator() {
final AtomicInteger nextId = new AtomicInteger();
return objects -> nextId.getAndIncrement();
}

/**
* Returns an id generator that generates longs starting from 0.
*/
public static <T> Function<? super Iterable<? extends T>, Long> longGenerator() {
public static <T> Function<Iterable<T>, Long> longGenerator() {
final AtomicLong nextId = new AtomicLong();
return objects -> nextId.getAndIncrement();
}

/**
* Returns an id generator that generates strings with a given prefix starting from 0.
*/
public static <T> Function<? super Iterable<? extends T>, String> stringGenerator(final String prefix) {
public static <T> Function<Iterable<T>, String> stringGenerator(final String prefix) {
final AtomicLong nextId = new AtomicLong();
return objects -> prefix + nextId.getAndIncrement();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public class ConsistentClustering<C extends Comparable<C>, T, I extends Comparab
final T firstElement = clusters.get(0).get(0);
final List<Candidate<T>> candidates = clusters.stream()
.flatMap(cluster -> cluster.getElements().stream()
.map(record -> new OnlineCandidate<>(firstElement, record)))
.map(element -> new OnlineCandidate<>(firstElement, element)))
.collect(Collectors.toList());
final List<Cluster<C, T>> transitiveClusters = this.getInternalClosure().clusterDuplicates(candidates);
if (transitiveClusters.size() != 1) {
Expand All @@ -104,13 +104,13 @@ public class ConsistentClustering<C extends Comparable<C>, T, I extends Comparab
}

@Override
public @NonNull Function<? super Iterable<? extends I>, C> getClusterIdGenerator() {
public @NonNull Function<Iterable<I>, C> getClusterIdGenerator() {
return this.clustering.getClusterIdGenerator();
}

private boolean noRecordInIndex(final Collection<? extends Cluster<C, T>> clusters) {
final Map<I, Cluster<C, T>> clusterIndex = this.getInternalClosure().getClusterIndex();
return clusters.stream().flatMap(cluster -> cluster.getElements().stream())
.allMatch(record -> clusterIndex.get(this.idExtractor.apply(record)) == null);
.allMatch(element -> clusterIndex.get(this.idExtractor.apply(element)) == null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ public class OracleClustering<C extends Comparable<C>, T, I> implements Clusteri
}

@Override
public @NonNull Function<Iterable<? extends I>, C> getClusterIdGenerator() {
final Map<@NonNull Iterable<? extends I>, C> elementsToClusterId =
public @NonNull Function<Iterable<I>, C> getClusterIdGenerator() {
final Map<@NonNull Iterable<I>, C> elementsToClusterId =
this.goldClusters.stream()
.collect(Collectors.toMap(this::getElementIds, Cluster::getId));
return elementsToClusterId::get;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ Stream<Cluster<C, T>> refine(final Stream<? extends Cluster<C, T>> clusters,
* A function to generate the id for newly split clusters.
*/
@NonNull
Function<? super Iterable<? extends I>, C> getClusterIdGenerator();
Function<Iterable<I>, C> getClusterIdGenerator();

/**
* A function to extract the id of a record.
*/
@NonNull
Function<? super T, I> getIdExtractor();
Function<T, I> getIdExtractor();
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
import lombok.Value;
import lombok.With;
import lombok.experimental.FieldDefaults;
import lombok.experimental.Wither;
import org.apache.commons.lang3.tuple.Pair;


Expand Down Expand Up @@ -95,12 +94,12 @@ public class RefineClusterImpl<C extends Comparable<C>, T, I> implements RefineC
* A function to generate the id for newly split clusters.
*/
@NonNull
Function<? super Iterable<? extends I>, C> clusterIdGenerator;
Function<Iterable<I>, C> clusterIdGenerator;
/**
* A function to extract the id of a record.
*/
@NonNull
Function<? super T, I> idExtractor;
Function<T, I> idExtractor;

private static double getWeight(final ClassificationResult classificationResult) {
switch (classificationResult.getClassification()) {
Expand Down Expand Up @@ -167,7 +166,7 @@ static int triangularNumber(final int n) {
private List<ClassifiedCandidate<T>> getRelevantClassifications(final Cluster<C, ? super T> cluster,
final @NonNull Map<T, List<ClassifiedCandidate<T>>> relevantClassificationIndex) {
return cluster.getElements().stream()
.flatMap(record -> relevantClassificationIndex.getOrDefault(record, List.of()).stream()
.flatMap(element -> relevantClassificationIndex.getOrDefault(element, List.of()).stream()
.filter(classifiedCandidate -> cluster
.contains(classifiedCandidate.getCandidate().getRecord2())))
.collect(Collectors.toList());
Expand Down Expand Up @@ -297,7 +296,7 @@ private List<I> getElementIds(final Collection<? extends T> records) {
.collect(Collectors.toList());
}

private List<WeightedEdge> addRandomEdges(final @NonNull List<? extends WeightedEdge> edges,
private List<WeightedEdge> addRandomEdges(final @NonNull List<WeightedEdge> edges,
final int desiredNumEdges) {
final Set<WeightedEdge> weightedEdges = new LinkedHashSet<>(edges);
for (int distance = 2; distance < this.maxSmallClusterSize && weightedEdges.size() < desiredNumEdges;
Expand All @@ -318,7 +317,7 @@ private List<WeightedEdge> addRandomEdges(final @NonNull List<? extends Weighted
}

private List<WeightedEdge> getWeightedEdges(final @NonNull Cluster<C, ? extends T> cluster,
final List<? extends WeightedEdge> duplicates,
final List<WeightedEdge> duplicates,
final int desiredNumEdges) {
final List<WeightedEdge> edges = this.getEdges(cluster, duplicates, desiredNumEdges);

Expand All @@ -328,7 +327,7 @@ private List<WeightedEdge> getWeightedEdges(final @NonNull Cluster<C, ? extends
}

private List<WeightedEdge> getEdges(final @NonNull Cluster<C, ? extends T> cluster,
final List<? extends WeightedEdge> duplicates, final int desiredNumEdges) {
final List<WeightedEdge> duplicates, final int desiredNumEdges) {
if (duplicates.isEmpty()) {
final int n = cluster.size();
return getRandomEdges(triangularNumber(n), desiredNumEdges);
Expand Down Expand Up @@ -427,7 +426,7 @@ private boolean overlaps(final @NonNull WeightedEdge e) {

static class GreedyClustering<C extends Comparable<C>, T> {

int[] greedyCluster(final Cluster<C, T> cluster, final @NonNull Collection<? extends WeightedEdge> edges) {
int[] greedyCluster(final Cluster<C, T> cluster, final @NonNull Collection<WeightedEdge> edges) {

final Collection<WeightedEdge> queue = new PriorityQueue<>(Comparator.comparing(WeightedEdge::getWeight));
queue.addAll(edges);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ private I getClusterId(final Cluster<C, ? extends T> cluster) {
}

@Override
public @NonNull Function<? super Iterable<? extends I>, C> getClusterIdGenerator() {
public @NonNull Function<Iterable<I>, C> getClusterIdGenerator() {
return this.closure.getClusterIdGenerator();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class TransitiveClosure<C extends Comparable<C>, T, I extends Comparable<
* A function to generate the id for newly formed clusters.
*/
@NonNull
Function<? super Iterable<? extends I>, C> clusterIdGenerator;
Function<Iterable<I>, C> clusterIdGenerator;
/**
* A backing map for old clusters. Defaults to an in-memory map if null during construction.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public class AggregatingSimilarityMeasure<T> implements SimilarityMeasure<T> {
*/
@NonNull ToDoubleFunction<? super DoubleStream> aggregator;
/**
* The similarity measures that will successively applied on the input values.
 * The similarity measures that will be successively applied to the input values.
*/
@NonNull List<SimilarityMeasure<? super T>> similarityMeasures;

Expand Down
10 changes: 5 additions & 5 deletions core/src/main/java/com/bakdata/dedupe/clustering/Cluster.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ public Cluster(final @NonNull C id) {
this(id, new ArrayList<>());
}

public void add(final @NonNull T record) {
this.elements.add(record);
public void add(final @NonNull T element) {
this.elements.add(element);
}

public int size() {
Expand All @@ -78,8 +78,8 @@ public int size() {
return this.elements.get(index);
}

public boolean contains(final @NonNull T record) {
return this.elements.contains(record);
public boolean contains(final @NonNull T element) {
return this.elements.contains(element);
}

/**
Expand All @@ -91,7 +91,7 @@ public boolean contains(final @NonNull T record) {
* @return the newly created merged cluster or this iff {@code other == this}.
*/
public @NonNull <I> Cluster<C, T> merge(
final @NonNull Function<? super Iterable<? extends I>, ? extends C> idGenerator,
final @NonNull Function<? super Iterable<I>, ? extends C> idGenerator,
final @NonNull Function<? super T, ? extends I> idExtractor,
final @NonNull Cluster<C, ? extends T> other) {
if (other == this) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,6 @@ public interface Clustering<C extends Comparable<C>, T, I> {
*
* @return the cluster id generator.
*/
@NonNull Function<? super Iterable<? extends I>, C> getClusterIdGenerator();
@NonNull
Function<Iterable<I>, C> getClusterIdGenerator();
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ public class Clusters {
* @throws IllegalArgumentException when there is not exactly one cluster
*/
public static <C extends Comparable<C>, T> @NonNull Cluster<C, T> getContainingCluster(
final @NonNull Iterator<? extends Cluster<C, T>> clusterIterator, final @NonNull T record) {
final @NonNull Iterator<? extends Cluster<C, T>> clusterIterator, final @NonNull T element) {
final Spliterator<Cluster<C, T>> spliterator =
Spliterators.spliteratorUnknownSize(clusterIterator, ORDERED | NONNULL);
final List<? extends Cluster<C, T>> mainClusters = StreamSupport.stream(spliterator, false)
.filter(c -> c.contains(record))
.filter(c -> c.contains(element))
.collect(Collectors.toList());
if (mainClusters.size() != 1) {
throw new IllegalArgumentException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,23 @@ class PersonDeduplicationTest {
private static final DateTimeFormatter BDAY_FORMAT = DateTimeFormatter.ofPattern("dd.MM.yy");

private static List<Person> parseCsv(final String resourceName) throws IOException {
final CSVFormat format = CSVFormat.newFormat('\t').withFirstRecordAsHeader().withQuote('"');
final CSVFormat format = CSVFormat.DEFAULT.builder()
.setDelimiter('\t')
.setQuote('"')
.setHeader()
.setSkipHeaderRecord(true)
.get();
try (final CSVParser parser = CSVParser
.parse(PersonDeduplicationTest.class.getResourceAsStream(resourceName), StandardCharsets.UTF_8,
format)) {
return parser.getRecords()
.stream()
.map(record -> Person.builder()
.id(record.get("id"))
.firstName(record.get("firstname_full"))
.lastName(record.get("lastname"))
.birthDate(LocalDate.parse(record.get("birthdate"), BDAY_FORMAT))
.gender(Gender.valueOf(record.get("gender").toUpperCase()))
.map(csvRecord -> Person.builder()
.id(csvRecord.get("id"))
.firstName(csvRecord.get("firstname_full"))
.lastName(csvRecord.get("lastname"))
.birthDate(LocalDate.parse(csvRecord.get("birthdate"), BDAY_FORMAT))
.gender(Gender.valueOf(csvRecord.get("gender").toUpperCase()))
.lastModified(LocalDateTime.now())
.build())
.collect(Collectors.toList());
Expand Down
6 changes: 3 additions & 3 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ junit-jupiter = { group = "org.junit.jupiter", name = "junit-jupiter", version.r
assertj = { group = "org.assertj", name = "assertj-core", version = "3.27.7" }

[plugins]
release = { id = "com.bakdata.release", version = "2.1.0" }
sonar = { id = "com.bakdata.sonar", version = "2.1.0" }
sonatype = { id = "com.bakdata.sonatype", version = "2.1.0" }
release = { id = "com.bakdata.release", version = "2.2.0" }
sonar = { id = "com.bakdata.sonar", version = "2.2.0" }
sonatype = { id = "com.bakdata.sonatype", version = "2.2.0" }
lombok = { id = "io.freefair.lombok", version = "9.2.0" }
Loading