diff --git a/.github/workflows/build-and-publish.yaml b/.github/workflows/build-and-publish.yaml index 6bf209c..72ac677 100644 --- a/.github/workflows/build-and-publish.yaml +++ b/.github/workflows/build-and-publish.yaml @@ -10,9 +10,7 @@ on: jobs: build-and-publish: name: Java Gradle - uses: bakdata/ci-templates/.github/workflows/java-gradle-library.yaml@1.75.0 - with: - java-version: 17 + uses: bakdata/ci-templates/.github/workflows/java-gradle-library.yaml@1.81.2 secrets: sonar-token: ${{ secrets.SONARCLOUD_TOKEN }} sonar-organization: ${{ secrets.SONARCLOUD_ORGANIZATION }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c4adac7..145e563 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -16,9 +16,8 @@ on: jobs: java-gradle-release: name: Java Gradle - uses: bakdata/ci-templates/.github/workflows/java-gradle-release.yaml@1.75.0 + uses: bakdata/ci-templates/.github/workflows/java-gradle-release.yaml@1.81.2 with: - java-version: 17 release-type: "${{ inputs.release-type }}" secrets: github-email: "${{ secrets.GH_EMAIL }}" diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/ClusterIdGenerators.java b/common/src/main/java/com/bakdata/dedupe/clustering/ClusterIdGenerators.java index 41cf7ad..379545d 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/ClusterIdGenerators.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/ClusterIdGenerators.java @@ -37,7 +37,7 @@ public class ClusterIdGenerators { /** * Returns an id generator that generates ints starting from 0. */ - public static Function, Integer> intGenerator() { + public static Function, Integer> intGenerator() { final AtomicInteger nextId = new AtomicInteger(); return objects -> nextId.getAndIncrement(); } @@ -45,7 +45,7 @@ public static Function, Integer> intGenerator( /** * Returns an id generator that generates longs starting from 0. 
*/ - public static Function, Long> longGenerator() { + public static Function, Long> longGenerator() { final AtomicLong nextId = new AtomicLong(); return objects -> nextId.getAndIncrement(); } @@ -53,7 +53,7 @@ public static Function, Long> longGenerator() /** * Returns an id generator that generates strings with a given prefix starting from 0. */ - public static Function, String> stringGenerator(final String prefix) { + public static Function, String> stringGenerator(final String prefix) { final AtomicLong nextId = new AtomicLong(); return objects -> prefix + nextId.getAndIncrement(); } diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/ConsistentClustering.java b/common/src/main/java/com/bakdata/dedupe/clustering/ConsistentClustering.java index 4ad7e68..4805bd2 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/ConsistentClustering.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/ConsistentClustering.java @@ -90,7 +90,7 @@ public class ConsistentClustering, T, I extends Comparab final T firstElement = clusters.get(0).get(0); final List> candidates = clusters.stream() .flatMap(cluster -> cluster.getElements().stream() - .map(record -> new OnlineCandidate<>(firstElement, record))) + .map(element -> new OnlineCandidate<>(firstElement, element))) .collect(Collectors.toList()); final List> transitiveClusters = this.getInternalClosure().clusterDuplicates(candidates); if (transitiveClusters.size() != 1) { @@ -104,13 +104,13 @@ public class ConsistentClustering, T, I extends Comparab } @Override - public @NonNull Function, C> getClusterIdGenerator() { + public @NonNull Function, C> getClusterIdGenerator() { return this.clustering.getClusterIdGenerator(); } private boolean noRecordInIndex(final Collection> clusters) { final Map> clusterIndex = this.getInternalClosure().getClusterIndex(); return clusters.stream().flatMap(cluster -> cluster.getElements().stream()) - .allMatch(record -> clusterIndex.get(this.idExtractor.apply(record)) 
== null); + .allMatch(element -> clusterIndex.get(this.idExtractor.apply(element)) == null); } } diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/OracleClustering.java b/common/src/main/java/com/bakdata/dedupe/clustering/OracleClustering.java index a520912..68e736b 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/OracleClustering.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/OracleClustering.java @@ -87,8 +87,8 @@ public class OracleClustering, T, I> implements Clusteri } @Override - public @NonNull Function, C> getClusterIdGenerator() { - final Map<@NonNull Iterable, C> elementsToClusterId = + public @NonNull Function, C> getClusterIdGenerator() { + final Map<@NonNull Iterable, C> elementsToClusterId = this.goldClusters.stream() .collect(Collectors.toMap(this::getElementIds, Cluster::getId)); return elementsToClusterId::get; diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java index b27a6a1..af0b544 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java @@ -43,11 +43,11 @@ Stream> refine(final Stream> clusters, * A function to generate the id for newly split clusters. */ @NonNull - Function, C> getClusterIdGenerator(); + Function, C> getClusterIdGenerator(); /** * A function to extract the id of a record. 
*/ @NonNull - Function getIdExtractor(); + Function getIdExtractor(); } diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefineClusterImpl.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefineClusterImpl.java index ef4a75c..2cec790 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefineClusterImpl.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefineClusterImpl.java @@ -56,7 +56,6 @@ import lombok.Value; import lombok.With; import lombok.experimental.FieldDefaults; -import lombok.experimental.Wither; import org.apache.commons.lang3.tuple.Pair; @@ -95,12 +94,12 @@ public class RefineClusterImpl, T, I> implements RefineC * A function to generate the id for newly split clusters. */ @NonNull - Function, C> clusterIdGenerator; + Function, C> clusterIdGenerator; /** * A function to extract the id of a record. */ @NonNull - Function idExtractor; + Function idExtractor; private static double getWeight(final ClassificationResult classificationResult) { switch (classificationResult.getClassification()) { @@ -167,7 +166,7 @@ static int triangularNumber(final int n) { private List> getRelevantClassifications(final Cluster cluster, final @NonNull Map>> relevantClassificationIndex) { return cluster.getElements().stream() - .flatMap(record -> relevantClassificationIndex.getOrDefault(record, List.of()).stream() + .flatMap(element -> relevantClassificationIndex.getOrDefault(element, List.of()).stream() .filter(classifiedCandidate -> cluster .contains(classifiedCandidate.getCandidate().getRecord2()))) .collect(Collectors.toList()); @@ -297,7 +296,7 @@ private List getElementIds(final Collection records) { .collect(Collectors.toList()); } - private List addRandomEdges(final @NonNull List edges, + private List addRandomEdges(final @NonNull List edges, final int desiredNumEdges) { final Set weightedEdges = new LinkedHashSet<>(edges); for (int distance = 2; distance < this.maxSmallClusterSize && weightedEdges.size() < 
desiredNumEdges; @@ -318,7 +317,7 @@ private List addRandomEdges(final @NonNull List getWeightedEdges(final @NonNull Cluster cluster, - final List duplicates, + final List duplicates, final int desiredNumEdges) { final List edges = this.getEdges(cluster, duplicates, desiredNumEdges); @@ -328,7 +327,7 @@ private List getWeightedEdges(final @NonNull Cluster getEdges(final @NonNull Cluster cluster, - final List duplicates, final int desiredNumEdges) { + final List duplicates, final int desiredNumEdges) { if (duplicates.isEmpty()) { final int n = cluster.size(); return getRandomEdges(triangularNumber(n), desiredNumEdges); @@ -427,7 +426,7 @@ private boolean overlaps(final @NonNull WeightedEdge e) { static class GreedyClustering, T> { - int[] greedyCluster(final Cluster cluster, final @NonNull Collection edges) { + int[] greedyCluster(final Cluster cluster, final @NonNull Collection edges) { final Collection queue = new PriorityQueue<>(Comparator.comparing(WeightedEdge::getWeight)); queue.addAll(edges); diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefinedTransitiveClosure.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefinedTransitiveClosure.java index da20c9c..66c8693 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefinedTransitiveClosure.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefinedTransitiveClosure.java @@ -121,7 +121,7 @@ private I getClusterId(final Cluster cluster) { } @Override - public @NonNull Function, C> getClusterIdGenerator() { + public @NonNull Function, C> getClusterIdGenerator() { return this.closure.getClusterIdGenerator(); } diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/TransitiveClosure.java b/common/src/main/java/com/bakdata/dedupe/clustering/TransitiveClosure.java index 7968b46..20e7abc 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/TransitiveClosure.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/TransitiveClosure.java @@ -60,7 
+60,7 @@ public class TransitiveClosure, T, I extends Comparable< * A function to generate the id for newly formed clusters. */ @NonNull - Function, C> clusterIdGenerator; + Function, C> clusterIdGenerator; /** * A backing map for old clusters. Defaults to an in-memory map if null during construction. */ diff --git a/common/src/main/java/com/bakdata/dedupe/similarity/AggregatingSimilarityMeasure.java b/common/src/main/java/com/bakdata/dedupe/similarity/AggregatingSimilarityMeasure.java index 4b43357..ac1db81 100644 --- a/common/src/main/java/com/bakdata/dedupe/similarity/AggregatingSimilarityMeasure.java +++ b/common/src/main/java/com/bakdata/dedupe/similarity/AggregatingSimilarityMeasure.java @@ -46,7 +46,7 @@ public class AggregatingSimilarityMeasure implements SimilarityMeasure { */ @NonNull ToDoubleFunction aggregator; /** - * The similarity measures that will successively applied on the input values. + * The similarity measures that will be successively applied to the input values. */ @NonNull List> similarityMeasures; diff --git a/core/src/main/java/com/bakdata/dedupe/clustering/Cluster.java b/core/src/main/java/com/bakdata/dedupe/clustering/Cluster.java index 374a944..5099b00 100644 --- a/core/src/main/java/com/bakdata/dedupe/clustering/Cluster.java +++ b/core/src/main/java/com/bakdata/dedupe/clustering/Cluster.java @@ -66,8 +66,8 @@ public Cluster(final @NonNull C id) { this(id, new ArrayList<>()); } - public void add(final @NonNull T record) { - this.elements.add(record); + public void add(final @NonNull T element) { + this.elements.add(element); } public int size() { @@ -78,8 +78,8 @@ public int size() { return this.elements.get(index); } - public boolean contains(final @NonNull T record) { - return this.elements.contains(record); + public boolean contains(final @NonNull T element) { + return this.elements.contains(element); } /** @@ -91,7 +91,7 @@ public boolean contains(final @NonNull T record) { * @return the newly created merged cluster or this iff {@code 
other == this}. */ public @NonNull Cluster merge( - final @NonNull Function, ? extends C> idGenerator, + final @NonNull Function, ? extends C> idGenerator, final @NonNull Function idExtractor, final @NonNull Cluster other) { if (other == this) { diff --git a/core/src/main/java/com/bakdata/dedupe/clustering/Clustering.java b/core/src/main/java/com/bakdata/dedupe/clustering/Clustering.java index 7e6ac0d..4f876f1 100644 --- a/core/src/main/java/com/bakdata/dedupe/clustering/Clustering.java +++ b/core/src/main/java/com/bakdata/dedupe/clustering/Clustering.java @@ -51,5 +51,6 @@ public interface Clustering, T, I> { * * @return the cluster id generator. */ - @NonNull Function, C> getClusterIdGenerator(); + @NonNull + Function, C> getClusterIdGenerator(); } diff --git a/core/src/main/java/com/bakdata/dedupe/clustering/Clusters.java b/core/src/main/java/com/bakdata/dedupe/clustering/Clusters.java index cbeb2ee..a12990f 100644 --- a/core/src/main/java/com/bakdata/dedupe/clustering/Clusters.java +++ b/core/src/main/java/com/bakdata/dedupe/clustering/Clusters.java @@ -48,11 +48,11 @@ public class Clusters { * @throws IllegalArgumentException when there is not exactly one cluster */ public static , T> @NonNull Cluster getContainingCluster( - final @NonNull Iterator> clusterIterator, final @NonNull T record) { + final @NonNull Iterator> clusterIterator, final @NonNull T element) { final Spliterator> spliterator = Spliterators.spliteratorUnknownSize(clusterIterator, ORDERED | NONNULL); final List> mainClusters = StreamSupport.stream(spliterator, false) - .filter(c -> c.contains(record)) + .filter(c -> c.contains(element)) .collect(Collectors.toList()); if (mainClusters.size() != 1) { throw new IllegalArgumentException( diff --git a/examples/src/test/java/com/bakdata/dedupe/person/PersonDeduplicationTest.java b/examples/src/test/java/com/bakdata/dedupe/person/PersonDeduplicationTest.java index 1dcb601..e73e39a 100644 --- 
a/examples/src/test/java/com/bakdata/dedupe/person/PersonDeduplicationTest.java +++ b/examples/src/test/java/com/bakdata/dedupe/person/PersonDeduplicationTest.java @@ -43,18 +43,23 @@ class PersonDeduplicationTest { private static final DateTimeFormatter BDAY_FORMAT = DateTimeFormatter.ofPattern("dd.MM.yy"); private static List parseCsv(final String resourceName) throws IOException { - final CSVFormat format = CSVFormat.newFormat('\t').withFirstRecordAsHeader().withQuote('"'); + final CSVFormat format = CSVFormat.DEFAULT.builder() + .setDelimiter('\t') + .setQuote('"') + .setHeader() + .setSkipHeaderRecord(true) + .get(); try (final CSVParser parser = CSVParser .parse(PersonDeduplicationTest.class.getResourceAsStream(resourceName), StandardCharsets.UTF_8, format)) { return parser.getRecords() .stream() - .map(record -> Person.builder() - .id(record.get("id")) - .firstName(record.get("firstname_full")) - .lastName(record.get("lastname")) - .birthDate(LocalDate.parse(record.get("birthdate"), BDAY_FORMAT)) - .gender(Gender.valueOf(record.get("gender").toUpperCase())) + .map(csvRecord -> Person.builder() + .id(csvRecord.get("id")) + .firstName(csvRecord.get("firstname_full")) + .lastName(csvRecord.get("lastname")) + .birthDate(LocalDate.parse(csvRecord.get("birthdate"), BDAY_FORMAT)) + .gender(Gender.valueOf(csvRecord.get("gender").toUpperCase())) .lastModified(LocalDateTime.now()) .build()) .collect(Collectors.toList()); diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 7ccaa6c..c0b694d 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -13,7 +13,7 @@ junit-jupiter = { group = "org.junit.jupiter", name = "junit-jupiter", version.r assertj = { group = "org.assertj", name = "assertj-core", version = "3.27.7" } [plugins] -release = { id = "com.bakdata.release", version = "2.1.0" } -sonar = { id = "com.bakdata.sonar", version = "2.1.0" } -sonatype = { id = "com.bakdata.sonatype", version = "2.1.0" } +release = { id = 
"com.bakdata.release", version = "2.2.0" } +sonar = { id = "com.bakdata.sonar", version = "2.2.0" } +sonatype = { id = "com.bakdata.sonatype", version = "2.2.0" } lombok = { id = "io.freefair.lombok", version = "9.2.0" }