Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,47 @@ private void generateUnpartitionTableLocation() {
if (this.addFiles.isEmpty()) {
unpartitionTableLocation = createUnpartitionEmptyLocationForHive();
} else {
unpartitionTableLocation = TableFileUtil.getFileDir(this.addFiles.get(0).path().toString());
String firstFileDir = TableFileUtil.getFileDir(this.addFiles.get(0).path().toString());
for (DataFile dataFile : this.addFiles) {
String fileDir = TableFileUtil.getFileDir(dataFile.path().toString());
if (!new Path(firstFileDir).equals(new Path(fileDir))) {
throw new CannotAlterHiveLocationException(
"can't update hive location for non-partitioned table, "
+ "files are not under the same directory. "
+ "expected: "
+ firstFileDir
+ ", actual: "
+ fileDir);
}
}
checkNonPartitionedHiveLocationChange(firstFileDir);
unpartitionTableLocation = firstFileDir;
}
}

/**
 * Guards against switching the Hive location of a non-partitioned table while live data files
 * would be left behind in the current location.
 *
 * <p>The check is skipped when {@code validateLocation} is off, when the Hive table has no
 * location yet, or when {@code newLocation} equals the current location. Otherwise every file
 * planned by a scan of {@code table} that lies under the current Hive location must be part of
 * {@code deleteFiles}.
 *
 * @param newLocation directory the Hive table location is about to be switched to
 * @throws CannotAlterHiveLocationException if a file under the current Hive location is not in
 *     the delete set
 * @throws RuntimeException if closing the planned file scan fails with an {@link IOException}
 */
private void checkNonPartitionedHiveLocationChange(String newLocation) {
  String currentHiveLocation = hiveTable.getSd().getLocation();
  if (!validateLocation
      || currentHiveLocation == null
      || isPathEquals(newLocation, currentHiveLocation)) {
    return;
  }
  // Match on a directory boundary: a bare prefix test would also match sibling directories
  // whose name merely starts with the current location (e.g. ".../hive/123" vs
  // ".../hive/1234/file"), and wrongly reject the commit.
  String currentLocationPrefix =
      currentHiveLocation.endsWith("/") ? currentHiveLocation : currentHiveLocation + "/";
  Set<String> deleteFilePaths =
      deleteFiles.stream().map(f -> f.path().toString()).collect(Collectors.toSet());
  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    for (FileScanTask task : tasks) {
      String filePath = task.file().path().toString();
      if (filePath.startsWith(currentLocationPrefix) && !deleteFilePaths.contains(filePath)) {
        throw new CannotAlterHiveLocationException(
            "can't update hive location for non-partitioned table, "
                + "not all files in current hive location are deleted. "
                + "file: "
                + filePath
                + " is not in the delete set");
      }
    }
  } catch (IOException e) {
    // Keep the cause and say which step failed.
    throw new RuntimeException(
        "failed to plan files while checking hive location change from " + currentHiveLocation,
        e);
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,6 @@ public void testOverwriteCleanUntrackedFiles() throws TException {

@Test
public void testOverwritePartFiles() throws TException {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
getMixedTable()
.updateProperties()
Expand Down Expand Up @@ -321,9 +320,43 @@ public void testOverwritePartFiles() throws TException {
Assert.assertThrows(CannotAlterHiveLocationException.class, overwriteFiles::commit);
}

/**
 * Non-partitioned table: an overwrite that adds freshly written files but deletes only one of
 * the two previously committed files is expected to be rejected.
 */
@Test
public void testOverwritePartFilesNonPartitioned() throws TException {
  Assume.assumeFalse(isPartitionedTable());
  // Target file size of "1"; the assertion below expects the two records in two files.
  getMixedTable()
      .updateProperties()
      .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "1")
      .commit();

  List<Record> initialRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-01T12:00:00"));
  List<DataFile> writtenFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(initialRecords);
  Assert.assertEquals(2, writtenFiles.size());
  DataFile fileToDelete = writtenFiles.get(0);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  OverwriteFiles initialOverwrite = baseStore.newOverwrite();
  for (DataFile written : writtenFiles) {
    initialOverwrite.addFile(written);
  }
  initialOverwrite.commit();
  List<DataFile> committedFiles = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  UpdateHiveFilesTestHelpers.validateHiveTableValues(
      TEST_HMS.getHiveClient(), getMixedTable(), committedFiles);

  // ================== test overwrite part files
  List<Record> replacementRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(3, "john", 0, "2022-01-01T12:00:00"));
  List<DataFile> replacementFiles =
      HiveDataTestHelpers.writerOf(getMixedTable())
          .transactionId(1L)
          .writeHive(replacementRecords);
  OverwriteFiles partialOverwrite = baseStore.newOverwrite();
  for (DataFile replacement : replacementFiles) {
    partialOverwrite.addFile(replacement);
  }
  partialOverwrite.deleteFile(fileToDelete);

  Assert.assertThrows(CannotAlterHiveLocationException.class, partialOverwrite::commit);
}

@Test
public void testOverwriteWithFilesUnderDifferentDir() {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
OverwriteFiles overwriteFiles = baseStore.newOverwrite();
Expand All @@ -343,9 +376,29 @@ public void testOverwriteWithFilesUnderDifferentDir() {
Assert.assertThrows(CannotAlterHiveLocationException.class, overwriteFiles::commit);
}

/**
 * Non-partitioned table: a single overwrite that adds files produced by two separate Hive
 * writes is expected to be rejected.
 */
@Test
public void testOverwriteWithFilesUnderDifferentDirNonPartitioned() {
  Assume.assumeFalse(isPartitionedTable());
  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  OverwriteFiles overwrite = baseStore.newOverwrite();

  List<Record> records =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"));

  List<DataFile> firstBatch =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(records);
  for (DataFile file : firstBatch) {
    overwrite.addFile(file);
  }

  // write data files under another dir
  List<DataFile> secondBatch =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(records);
  for (DataFile file : secondBatch) {
    overwrite.addFile(file);
  }

  Assert.assertThrows(CannotAlterHiveLocationException.class, overwrite::commit);
}

@Test
public void testOverwriteByAddFilesInDifferentDir() throws TException {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
List<Record> insertRecords = Lists.newArrayList();
insertRecords.add(tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"));
Expand All @@ -372,9 +425,36 @@ public void testOverwriteByAddFilesInDifferentDir() throws TException {
Assert.assertThrows(CannotAlterHiveLocationException.class, overwriteFiles::commit);
}

/**
 * Non-partitioned table: after an initial overwrite is committed, a second overwrite that only
 * adds files from a new Hive write (deleting nothing) is expected to be rejected.
 */
@Test
public void testOverwriteByAddFilesInDifferentDirNonPartitioned() throws TException {
  Assume.assumeFalse(isPartitionedTable());

  List<Record> initialRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"));
  List<DataFile> writtenFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(initialRecords);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  OverwriteFiles initialOverwrite = baseStore.newOverwrite();
  for (DataFile written : writtenFiles) {
    initialOverwrite.addFile(written);
  }
  initialOverwrite.commit();
  List<DataFile> committedFiles = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  UpdateHiveFilesTestHelpers.validateHiveTableValues(
      TEST_HMS.getHiveClient(), getMixedTable(), committedFiles);

  // ================== test add files only in a different dir
  List<Record> extraRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"),
          tableTestHelper().generateTestRecord(3, "john", 0, "2022-01-03T12:00:00"));
  List<DataFile> extraFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(extraRecords);
  OverwriteFiles addOnlyOverwrite = baseStore.newOverwrite();
  for (DataFile extra : extraFiles) {
    addOnlyOverwrite.addFile(extra);
  }

  Assert.assertThrows(CannotAlterHiveLocationException.class, addOnlyOverwrite::commit);
}

@Test
public void testOverwriteWithSameLocation() throws TException {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
List<Record> insertRecords = Lists.newArrayList();
insertRecords.add(tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"));
Expand All @@ -395,4 +475,36 @@ public void testOverwriteWithSameLocation() throws TException {

Assert.assertThrows(CannotAlterHiveLocationException.class, overwriteFiles::commit);
}

/**
 * Non-partitioned table: an overwrite that deletes and re-adds exactly the committed files is
 * expected to commit, leaving the same number of live files and consistent Hive table values.
 */
@Test
public void testOverwriteWithSameLocationNonPartitioned() throws TException {
  Assume.assumeFalse(isPartitionedTable());

  List<Record> records =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"));
  List<DataFile> writtenFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(records);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  OverwriteFiles initialOverwrite = baseStore.newOverwrite();
  for (DataFile written : writtenFiles) {
    initialOverwrite.addFile(written);
  }
  initialOverwrite.commit();
  List<DataFile> committedFiles = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  UpdateHiveFilesTestHelpers.validateHiveTableValues(
      TEST_HMS.getHiveClient(), getMixedTable(), committedFiles);

  // Replace the committed files with themselves: all deletes first, then all adds.
  OverwriteFiles sameLocationOverwrite = baseStore.newOverwrite();
  for (DataFile committed : committedFiles) {
    sameLocationOverwrite.deleteFile(committed);
  }
  for (DataFile committed : committedFiles) {
    sameLocationOverwrite.addFile(committed);
  }

  sameLocationOverwrite.commit();

  List<DataFile> filesAfterOverwrite = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  Assert.assertEquals(committedFiles.size(), filesAfterOverwrite.size());

  int liveFileCount = Lists.newArrayList(baseStore.newScan().planFiles()).size();
  Assert.assertEquals(committedFiles.size(), liveFileCount);

  UpdateHiveFilesTestHelpers.validateHiveTableValues(
      TEST_HMS.getHiveClient(), getMixedTable(), filesAfterOverwrite);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ public void testRewriteCleanUntrackedFiles() throws TException {

@Test
public void testRewritePartFiles() {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
getMixedTable()
.updateProperties()
Expand Down Expand Up @@ -256,9 +255,41 @@ public void testRewritePartFiles() {
Assert.assertThrows(CannotAlterHiveLocationException.class, rewriteFiles::commit);
}

/**
 * Non-partitioned table: a rewrite that replaces only one of the two committed files with
 * freshly written files is expected to be rejected.
 */
@Test
public void testRewritePartFilesNonPartitioned() {
  Assume.assumeFalse(isPartitionedTable());
  // Target file size of "1"; the assertion below expects the two records in two files.
  getMixedTable()
      .updateProperties()
      .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "1")
      .commit();

  List<Record> initialRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-01T12:00:00"));
  initDataFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(initialRecords);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  OverwriteFiles initialOverwrite = baseStore.newOverwrite();
  for (DataFile written : initDataFiles) {
    initialOverwrite.addFile(written);
  }
  initialOverwrite.commit();

  initDataFiles = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  Assert.assertEquals(2, initDataFiles.size());
  DataFile fileToReplace = initDataFiles.get(0);

  // ================== test rewrite part files
  List<Record> replacementRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(3, "john", 0, "2022-01-01T12:00:00"));
  List<DataFile> replacementFiles =
      HiveDataTestHelpers.writerOf(getMixedTable())
          .transactionId(1L)
          .writeHive(replacementRecords);

  RewriteFiles partialRewrite = baseStore.newRewrite();
  partialRewrite.rewriteFiles(
      Sets.newHashSet(fileToReplace), Sets.newHashSet(replacementFiles));
  Assert.assertThrows(CannotAlterHiveLocationException.class, partialRewrite::commit);
}

@Test
public void testRewriteWithFilesUnderDifferentDir() {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
initDataFiles();
List<Record> insertRecords = Lists.newArrayList();
Expand All @@ -280,9 +311,31 @@ public void testRewriteWithFilesUnderDifferentDir() {
Assert.assertThrows(CannotAlterHiveLocationException.class, rewriteFiles::commit);
}

/**
 * Non-partitioned table: a rewrite whose added files come from two separate Hive writes is
 * expected to be rejected.
 */
@Test
public void testRewriteWithFilesUnderDifferentDirNonPartitioned() {
  Assume.assumeFalse(isPartitionedTable());
  initDataFiles();

  List<Record> records =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(1, "john", 0, "2022-01-01T12:00:00"),
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"));

  Set<DataFile> filesToAdd = Sets.newHashSet();
  List<DataFile> firstBatch =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(records);
  filesToAdd.addAll(firstBatch);
  // write data files under another dir
  List<DataFile> secondBatch =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(records);
  filesToAdd.addAll(secondBatch);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  RewriteFiles rewrite = baseStore.newRewrite();
  Set<DataFile> filesToDelete = Sets.newHashSet(initDataFiles);
  rewrite.rewriteFiles(filesToDelete, filesToAdd);

  Assert.assertThrows(CannotAlterHiveLocationException.class, rewrite::commit);
}

@Test
public void testRewriteByAddFilesInDifferentDir() {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
initDataFiles();
List<Record> insertRecords = Lists.newArrayList();
Expand All @@ -300,9 +353,27 @@ public void testRewriteByAddFilesInDifferentDir() {
Assert.assertThrows(CannotAlterHiveLocationException.class, rewriteFiles::commit);
}

/**
 * Non-partitioned table: a rewrite that re-adds the initial files together with files from a
 * new Hive write is expected to be rejected.
 */
@Test
public void testRewriteByAddFilesInDifferentDirNonPartitioned() {
  Assume.assumeFalse(isPartitionedTable());
  initDataFiles();

  List<Record> extraRecords =
      Lists.newArrayList(
          tableTestHelper().generateTestRecord(2, "lily", 0, "2022-01-02T12:00:00"),
          tableTestHelper().generateTestRecord(3, "john", 0, "2022-01-03T12:00:00"));
  List<DataFile> extraFiles =
      HiveDataTestHelpers.writerOf(getMixedTable()).transactionId(1L).writeHive(extraRecords);

  // New files first, then the initial files, all in one add set.
  Set<DataFile> filesToAdd = Sets.newHashSet(extraFiles);
  filesToAdd.addAll(initDataFiles);

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  RewriteFiles rewrite = baseStore.newRewrite();
  Set<DataFile> filesToDelete = Sets.newHashSet(initDataFiles);
  rewrite.rewriteFiles(filesToDelete, filesToAdd);

  Assert.assertThrows(CannotAlterHiveLocationException.class, rewrite::commit);
}

@Test
public void testRewriteWithSameLocation() {
// TODO should add cases for tables without partition spec
Assume.assumeTrue(isPartitionedTable());
initDataFiles();
UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
Expand All @@ -311,4 +382,24 @@ public void testRewriteWithSameLocation() {

Assert.assertThrows(CannotAlterHiveLocationException.class, rewriteFiles::commit);
}

/**
 * Non-partitioned table: a rewrite that replaces the initial files with themselves is expected
 * to commit, leaving the same number of live files and consistent Hive table values.
 */
@Test
public void testRewriteWithSameLocationNonPartitioned() throws TException {
  Assume.assumeFalse(isPartitionedTable());
  initDataFiles();

  UnkeyedTable baseStore = MixedTableUtil.baseStore(getMixedTable());
  RewriteFiles sameLocationRewrite = baseStore.newRewrite();
  // Delete set and add set hold the same files.
  sameLocationRewrite.rewriteFiles(
      Sets.newHashSet(initDataFiles), Sets.newHashSet(initDataFiles));

  sameLocationRewrite.commit();

  int expectedFileCount = initDataFiles.size();
  List<DataFile> filesAfterRewrite = HiveDataTestHelpers.lastedAddedFiles(baseStore);
  Assert.assertEquals(expectedFileCount, filesAfterRewrite.size());

  int liveFileCount = Lists.newArrayList(baseStore.newScan().planFiles()).size();
  Assert.assertEquals(initDataFiles.size(), liveFileCount);

  UpdateHiveFilesTestHelpers.validateHiveTableValues(
      TEST_HMS.getHiveClient(), getMixedTable(), filesAfterRewrite);
}
}
Loading