diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..e69de29
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 1404ab2..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "PinataTests/PQClean"]
-	path = PinataTests/PQClean
-	url = https://github.com/PQClean/PQClean.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9c3ea2a..e9e3ceb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.16)
-project(Pinata VERSION 3.2 LANGUAGES C ASM)
+project(Pinata VERSION 4.0 LANGUAGES C ASM)
 
 if(NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "")
     message(STATUS "Setting CMAKE_BUILD_TYPE to MinSizeRel")
diff --git a/PinataTests/CMakeLists.txt b/PinataTests/CMakeLists.txt
index b73f54e..60eff6c 100644
--- a/PinataTests/CMakeLists.txt
+++ b/PinataTests/CMakeLists.txt
@@ -1,13 +1,31 @@
 cmake_minimum_required(VERSION 3.16)
-project(PinataTests C CXX)
+
+if(POLICY CMP0144)  # CMake >= 3.27: find_package() also honors upper-case <PACKAGENAME>_ROOT variables
+    cmake_policy(SET CMP0144 NEW)
+endif()
+if(POLICY CMP0167)  # CMake >= 3.30: FindBoost module is removed; find_package(Boost) uses BoostConfig.cmake
+    cmake_policy(SET CMP0167 NEW)
+endif()
+
+project(PinataTests VERSION 4.0 LANGUAGES C CXX)
 
 find_package(OpenSSL REQUIRED)
 find_package(GTest REQUIRED)
 find_package(Boost REQUIRED)
 
-set(DILITHIUM PQClean/crypto_sign/dilithium3/clean)
-set(KYBER PQClean/crypto_kem/kyber512/clean)
-set(COMMON PQClean/common)
+include(FetchContent)
+FetchContent_Declare(
+    pqm4
+    GIT_REPOSITORY https://github.com/mupq/pqm4.git
+    # Keep this git hash the same as the one in src/CMakeLists.txt !
+    GIT_TAG a24bb4b662016968c19f5e6a0719c9ad530f0286
+)
+FetchContent_MakeAvailable(pqm4)
+
+# We can reuse the PQClean sources checked out by PQM4.
+set(DILITHIUM "${pqm4_SOURCE_DIR}/mupq/pqclean/crypto_sign/ml-dsa-65/clean")
+set(KYBER "${pqm4_SOURCE_DIR}/mupq/pqclean/crypto_kem/ml-kem-512/clean")
+set(COMMON "${pqm4_SOURCE_DIR}/mupq/pqclean/common")
 
 add_executable(PinataTests
     main.cpp
@@ -39,5 +57,7 @@ add_executable(PinataTests
 )
 
 target_compile_features(PinataTests PRIVATE cxx_std_20)
-target_include_directories(PinataTests PRIVATE PQClean/common)
+target_include_directories(PinataTests PRIVATE "${pqm4_SOURCE_DIR}/mupq/pqclean/common")
+set_source_files_properties(PqcFirmware.cpp PROPERTIES INCLUDE_DIRECTORIES "${pqm4_SOURCE_DIR}/mupq/pqclean")
 target_link_libraries(PinataTests PRIVATE Boost::boost OpenSSL::Crypto GTest::GTest)
+
diff --git a/PinataTests/PQClean b/PinataTests/PQClean
deleted file mode 160000
index 4f86c39..0000000
--- a/PinataTests/PQClean
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4f86c3951562af84ea4036a7e0483a9e96cebba9
diff --git a/PinataTests/PqcFirmware.cpp b/PinataTests/PqcFirmware.cpp
index 85fd533..d94c285 100644
--- a/PinataTests/PqcFirmware.cpp
+++ b/PinataTests/PqcFirmware.cpp
@@ -4,42 +4,39 @@
 #include <openssl/rand.h>
 
 extern "C" {
-#include "PQClean/crypto_kem/kyber512/clean/api.h"
-#include "PQClean/crypto_sign/dilithium3/clean/api.h"
+#include "crypto_kem/ml-kem-512/clean/api.h"
+#include "crypto_sign/ml-dsa-65/clean/api.h"
 }
 
-#define DILITHIUM_PUBLIC_KEY_SIZE 1952
-#define DILITHIUM_PRIVATE_KEY_SIZE 4016
-#define DILITHIUM_SIGNATURE_SIZE 3293
-#define DILITHIUM_MESSAGE_SIZE 16
-#define DILITHIUM_SIGNED_MESSAGE_SIZE (DILITHIUM_SIGNATURE_SIZE + DILITHIUM_MESSAGE_SIZE)
+#define MLDSA_PUBLIC_KEY_SIZE 1952
+#define MLDSA_PRIVATE_KEY_SIZE 4032
+#define MLDSA_SIGNATURE_SIZE 3309
+#define MLDSA_MESSAGE_SIZE 16
+#define MLDSA_N 256
+#define MLDSA_SIGNED_MESSAGE_SIZE (MLDSA_SIGNATURE_SIZE + MLDSA_MESSAGE_SIZE)
 
-#define KYBER512_PUBLIC_KEY_SIZE 800
-#define KYBER512_PRIVATE_KEY_SIZE 1632
-#define KYBER512_SHARED_SECRET_SIZE 32
-#define KYBER512_CIPHERTEXT_SIZE 768
+#define MLKEM_PUBLIC_KEY_SIZE 800
+#define MLKEM_PRIVATE_KEY_SIZE 1632
+#define MLKEM_SHARED_SECRET_SIZE 32
+#define MLKEM_CIPHERTEXT_SIZE 768
 
-#if DILITHIUM_PUBLIC_KEY_SIZE != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES
+#if MLDSA_PUBLIC_KEY_SIZE != PQCLEAN_MLDSA65_CLEAN_CRYPTO_PUBLICKEYBYTES
 #error invalid public key size, update me!
 #endif
-#if DILITHIUM_PRIVATE_KEY_SIZE != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES
+#if MLDSA_PRIVATE_KEY_SIZE != PQCLEAN_MLDSA65_CLEAN_CRYPTO_SECRETKEYBYTES
 #error invalid private key size, update me!
 #endif
-#if DILITHIUM_SIGNATURE_SIZE != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES
+#if MLDSA_SIGNATURE_SIZE != PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES
 #error invalid signature size, update me!
 #endif
 
-#if defined(MODE) && !defined(DILITHIUM_MODE)
-#define DILITHIUM_MODE MODE
-#endif
-
-#if KYBER512_PUBLIC_KEY_SIZE != PQCLEAN_KYBER512_CLEAN_CRYPTO_PUBLICKEYBYTES
+#if MLKEM_PUBLIC_KEY_SIZE != PQCLEAN_MLKEM512_CLEAN_CRYPTO_PUBLICKEYBYTES
 #error invalid public key size, update me!
 #endif
-#if KYBER512_PRIVATE_KEY_SIZE != PQCLEAN_KYBER512_CLEAN_CRYPTO_SECRETKEYBYTES
+#if MLKEM_PRIVATE_KEY_SIZE != PQCLEAN_MLKEM512_CLEAN_CRYPTO_SECRETKEYBYTES
 #error invalid secret key size, update me!
 #endif
-#if KYBER512_SHARED_SECRET_SIZE != PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES
+#if MLKEM_SHARED_SECRET_SIZE != PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES
 #error invalid secret size, update me!
 #endif
 
@@ -52,28 +49,28 @@ class PqcFirmware : public TestBase {
 };
 
 TEST_F(PqcFirmware, DilithiumLevel3) {
-    std::array<unsigned char, DILITHIUM_PUBLIC_KEY_SIZE> publicKey;
-    std::array<unsigned char, DILITHIUM_PRIVATE_KEY_SIZE> privateKey;
-    std::array<unsigned char, DILITHIUM_MESSAGE_SIZE> message;
-    std::array<unsigned char, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + DILITHIUM_MESSAGE_SIZE> pinataSignedMessage;
-    std::array<unsigned char, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + DILITHIUM_MESSAGE_SIZE> referenceSignedMessage;
+    std::array<unsigned char, MLDSA_PUBLIC_KEY_SIZE> publicKey;
+    std::array<unsigned char, MLDSA_PRIVATE_KEY_SIZE> privateKey;
+    std::array<unsigned char, MLDSA_MESSAGE_SIZE> message;
+    std::array<unsigned char, PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES + MLDSA_MESSAGE_SIZE> pinataSignedMessage;
+    std::array<unsigned char, PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES + MLDSA_MESSAGE_SIZE> referenceSignedMessage;
 
     // Ensure the mode is the same
     std::cerr << "asserting security level\n";
-    ASSERT_EQ(mClient.dilithiumGetSecurityLevel(), 3);
+    ASSERT_EQ(mClient.mldsaGetSecurityLevel(), 3);
 
     // Ensure public and private key sizes match
     std::cerr << "checking key sizes\n";
-    const auto [pinataPublicKeySize, pinataPrivateKeySize] = mClient.dilithiumGetKeySizes();
-    ASSERT_EQ(pinataPublicKeySize, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES);
-    ASSERT_EQ(pinataPrivateKeySize, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES);
+    const auto [pinataPublicKeySize, pinataPrivateKeySize] = mClient.mldsaGetKeySizes();
+    ASSERT_EQ(pinataPublicKeySize, PQCLEAN_MLDSA65_CLEAN_CRYPTO_PUBLICKEYBYTES);
+    ASSERT_EQ(pinataPrivateKeySize, PQCLEAN_MLDSA65_CLEAN_CRYPTO_SECRETKEYBYTES);
 
     // Generate a public/private key pair with the reference X86 implementation
-    PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(publicKey.data(), privateKey.data());
+    PQCLEAN_MLDSA65_CLEAN_crypto_sign_keypair(publicKey.data(), privateKey.data());
 
     // Tell the pinata to use this public/private key pair for signing with Dilithium3
     std::cerr << "setup public/private key pair\n";
-    mClient.dilithiumSetPublicPrivateKeyPair(publicKey.data(), publicKey.size(), privateKey.data(), privateKey.size());
+    mClient.mldsaSetPublicPrivateKeyPair(publicKey.data(), publicKey.size(), privateKey.data(), privateKey.size());
 
     // Prepare the random message
     std::fill(pinataSignedMessage.begin(), pinataSignedMessage.end(), (unsigned char)0);
@@ -81,58 +78,58 @@ TEST_F(PqcFirmware, DilithiumLevel3) {
 
     // Sign the fuzzed message on pinata
     std::cerr << "sign message\n";
-    mClient.dilithiumSign(message.data(), message.size(), pinataSignedMessage.data(),
-                          PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES);
+    mClient.mldsaSign(message.data(), message.size(), pinataSignedMessage.data(),
+                          PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES);
 
     // Concatenate the signature and the fuzzed message together to obtain a "signed message"
-    ASSERT_EQ(pinataSignedMessage.size(), PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + message.size());
-    std::copy(message.begin(), message.end(), pinataSignedMessage.data() + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES);
+    ASSERT_EQ(pinataSignedMessage.size(), PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES + message.size());
+    std::copy(message.begin(), message.end(), pinataSignedMessage.data() + PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES);
 
     // The message should be at the end of the signed message buffer
-    ASSERT_EQ(std::memcmp(pinataSignedMessage.data() + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, message.begin(), 16), 0);
+    ASSERT_EQ(std::memcmp(pinataSignedMessage.data() + PQCLEAN_MLDSA65_CLEAN_CRYPTO_BYTES, message.data(), message.size()), 0);
 
     // Sign the fuzzed message with the X86 reference implementation.
     // The reference implementation doesn't use randomized signatures.
     unsigned long messageLength = static_cast<unsigned long>(pinataSignedMessage.size());
-    PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(referenceSignedMessage.data(), &messageLength, message.data(), message.size(),
+    PQCLEAN_MLDSA65_CLEAN_crypto_sign(referenceSignedMessage.data(), &messageLength, message.data(), message.size(),
                                          privateKey.data());
     ASSERT_EQ(messageLength, referenceSignedMessage.size());
 
     // Pinata sign --> Reference verify
-    ASSERT_EQ(PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(pinataSignedMessage.data(), &messageLength,
+    ASSERT_EQ(PQCLEAN_MLDSA65_CLEAN_crypto_sign_open(pinataSignedMessage.data(), &messageLength,
                                                         pinataSignedMessage.data(), pinataSignedMessage.size(),
                                                         publicKey.data()),
               0);
 
     // Reference sign --> Pinata verify
     std::cerr << "verify message\n";
-    ASSERT_TRUE(mClient.dilithiumVerify(referenceSignedMessage.data(), referenceSignedMessage.size()));
+    ASSERT_TRUE(mClient.mldsaVerify(referenceSignedMessage.data(), referenceSignedMessage.size()));
 }
 
 TEST_F(PqcFirmware, Kyber512) {
-    std::array<unsigned char, KYBER512_PUBLIC_KEY_SIZE> publicKey;
-    std::array<unsigned char, KYBER512_PRIVATE_KEY_SIZE> privateKey;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssPinata;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssRef;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssPinataGenerateRefDecode;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssRefGeneratePinataDecode;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssRefGenerateRefDecode;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_BYTES> ssPinataGeneratePinataDecode;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_CIPHERTEXTBYTES> ctPinata;
-    std::array<unsigned char, PQCLEAN_KYBER512_CLEAN_CRYPTO_CIPHERTEXTBYTES> ctRef;
+    std::array<unsigned char, MLKEM_PUBLIC_KEY_SIZE> publicKey;
+    std::array<unsigned char, MLKEM_PRIVATE_KEY_SIZE> privateKey;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssPinata;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssRef;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssPinataGenerateRefDecode;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssRefGeneratePinataDecode;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssRefGenerateRefDecode;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_BYTES> ssPinataGeneratePinataDecode;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_CIPHERTEXTBYTES> ctPinata;
+    std::array<unsigned char, PQCLEAN_MLKEM512_CLEAN_CRYPTO_CIPHERTEXTBYTES> ctRef;
 
     // Ensure public and private key sizes match
     std::cerr << "checking wether key sizes agree\n";
-    const auto [pinataPublicKeySize, pinataPrivateKeySize] = mClient.kyber512GetKeySizes();
-    ASSERT_EQ(pinataPublicKeySize, PQCLEAN_KYBER512_CLEAN_CRYPTO_PUBLICKEYBYTES);
-    ASSERT_EQ(pinataPrivateKeySize, PQCLEAN_KYBER512_CLEAN_CRYPTO_SECRETKEYBYTES);
+    const auto [pinataPublicKeySize, pinataPrivateKeySize] = mClient.mlkemGetKeySizes();
+    ASSERT_EQ(pinataPublicKeySize, PQCLEAN_MLKEM512_CLEAN_CRYPTO_PUBLICKEYBYTES);
+    ASSERT_EQ(pinataPrivateKeySize, PQCLEAN_MLKEM512_CLEAN_CRYPTO_SECRETKEYBYTES);
 
     // Generate a public/private key pair with the reference X86 implementation
-    PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(publicKey.data(), privateKey.data());
+    PQCLEAN_MLKEM512_CLEAN_crypto_kem_keypair(publicKey.data(), privateKey.data());
 
-    // Tell the pinata to use this public/private key pair for encrypting shared secrets with kyber512.
+    // Tell the pinata to use this public/private key pair for encrypting shared secrets with mlkem.
     std::cerr << "setting public private key pair\n";
-    mClient.kyber512SetPublicPrivateKeyPair(publicKey.data(), publicKey.size(), privateKey.data(), privateKey.size());
+    mClient.mlkemSetPublicPrivateKeyPair(publicKey.data(), publicKey.size(), privateKey.data(), privateKey.size());
 
     // Zero out arrays
     std::fill(ssPinataGenerateRefDecode.begin(), ssPinataGenerateRefDecode.end(), (unsigned char)0);
@@ -142,25 +139,25 @@ TEST_F(PqcFirmware, Kyber512) {
 
     // Generate a shared secret on Pinata
     std::cerr << "generating shared secret\n";
-    mClient.kyber512Generate(ssPinata.data(), ssPinata.size(), ctPinata.data(), ctPinata.size());
+    mClient.mlkemGenerate(ssPinata.data(), ssPinata.size(), ctPinata.data(), ctPinata.size());
 
     // Generate a shared secret with the reference implementation.
-    PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(ctRef.data(), ssRef.data(), publicKey.data());
+    PQCLEAN_MLKEM512_CLEAN_crypto_kem_enc(ctRef.data(), ssRef.data(), publicKey.data());
 
     // Decode the Pinata ciphertext with ref impl
-    PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(ssPinataGenerateRefDecode.data(), ctPinata.data(), privateKey.data());
+    PQCLEAN_MLKEM512_CLEAN_crypto_kem_dec(ssPinataGenerateRefDecode.data(), ctPinata.data(), privateKey.data());
 
     // Decode the ref ciphertext with ref impl
-    PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(ssRefGenerateRefDecode.data(), ctRef.data(), privateKey.data());
+    PQCLEAN_MLKEM512_CLEAN_crypto_kem_dec(ssRefGenerateRefDecode.data(), ctRef.data(), privateKey.data());
 
     // Decode the Pinata ciphertext with Pinata impl
     std::cerr << "decoding shared secret\n";
-    mClient.kyber512Decode(ctPinata.data(), ctPinata.size(), ssPinataGeneratePinataDecode.data(),
+    mClient.mlkemDecode(ctPinata.data(), ctPinata.size(), ssPinataGeneratePinataDecode.data(),
                            ssPinataGeneratePinataDecode.size());
 
     // Decode the ref ciphertext with Pinata impl
     std::cerr << "decoding shared secret (ref)\n";
-    mClient.kyber512Decode(ctRef.data(), ctRef.size(), ssRefGeneratePinataDecode.data(),
+    mClient.mlkemDecode(ctRef.data(), ctRef.size(), ssRefGeneratePinataDecode.data(),
                            ssRefGeneratePinataDecode.size());
 
     ASSERT_EQ(ssPinata, ssPinataGeneratePinataDecode);
diff --git a/PinataTests/common.cpp b/PinataTests/common.cpp
index 8e1bbd9..026cc1b 100644
--- a/PinataTests/common.cpp
+++ b/PinataTests/common.cpp
@@ -14,22 +14,22 @@
 #include <termios.h>
 #include <unistd.h>
 
-constexpr const size_t PINATA_DILITHIUM_MESSAGE_LENGTH = 16;
-constexpr const size_t PINATA_KYBER512_SHARED_SECRET_LENGTH = 32;
+constexpr const size_t PINATA_MLDSA_MESSAGE_LENGTH = 16;
+constexpr const size_t PINATA_MLKEM_SHARED_SECRET_LENGTH = 32;
 
 const uint8_t CMD_GET_CODE_REV = 0xF1;
 const uint8_t CMD_HWAES128_ENC = 0xCA;
 
-const uint8_t CMD_SW_DILITHIUM_GET_VARIANT = 0x90;
-const uint8_t CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY = 0x91;
-const uint8_t CMD_SW_DILITHIUM_VERIFY = 0x92;
-const uint8_t CMD_SW_DILITHIUM_SIGN = 0x93;
-const uint8_t CMD_SW_DILITHIUM_GET_KEY_SIZES = 0x94;
+const uint8_t CMD_SW_MLDSA_GET_VARIANT = 0x90;
+const uint8_t CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY = 0x91;
+const uint8_t CMD_SW_MLDSA_VERIFY = 0x92;
+const uint8_t CMD_SW_MLDSA_SIGN = 0x93;
+const uint8_t CMD_SW_MLDSA_GET_KEY_SIZES = 0x94;
 
-const uint8_t CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY = 0x02;
-const uint8_t CMD_SW_KYBER512_GET_KEY_SIZES = 0x03;
-const uint8_t CMD_SW_KYBER512_GENERATE = 0x04;
-const uint8_t CMD_SW_KYBER512_DEC = 0x05;
+const uint8_t CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY = 0x02;
+const uint8_t CMD_SW_MLKEM_GET_KEY_SIZES = 0x03;
+const uint8_t CMD_SW_MLKEM_GENERATE = 0x04;
+const uint8_t CMD_SW_MLKEM_DEC = 0x05;
 
 const uint8_t CMD_SWDES_ENC = 0x44;
 const uint8_t CMD_SWDES_DEC = 0x45;
@@ -89,7 +89,7 @@ std::pair<int, int> PinataClient::getVersion() {
 
 FirmwareVariant PinataClient::determineFirmwareVariant() {
     // Detect it via this command. It will return "BadCmd\n" when dealing with a classic or hw variant.
-    command(CMD_SW_DILITHIUM_GET_VARIANT);
+    command(CMD_SW_MLDSA_GET_VARIANT);
     uint8_t byte;
     read(&byte, sizeof(byte));
     // If we're dealing with a PQC variant then this should return the number "3".
@@ -122,21 +122,21 @@ FirmwareVariant PinataClient::determineFirmwareVariant() {
     return FirmwareVariant::Classic;
 }
 
-uint8_t PinataClient::dilithiumGetSecurityLevel() {
-    command(CMD_SW_DILITHIUM_GET_VARIANT);
+uint8_t PinataClient::mldsaGetSecurityLevel() {
+    command(CMD_SW_MLDSA_GET_VARIANT);
     return readNumber<uint8_t>();
 }
 
-std::pair<int, int> PinataClient::dilithiumGetKeySizes() {
-    command(CMD_SW_DILITHIUM_GET_KEY_SIZES);
+std::pair<int, int> PinataClient::mldsaGetKeySizes() {
+    command(CMD_SW_MLDSA_GET_KEY_SIZES);
     const uint16_t publicKeySize = readNumber<uint16_t>();
     const uint16_t privateKeySize = readNumber<uint16_t>();
     return std::make_pair(publicKeySize, privateKeySize);
 }
 
-void PinataClient::dilithiumSetPublicPrivateKeyPair(const uint8_t *publicKey, size_t publicKeySize,
-                                                    const uint8_t *privateKey, size_t privateKeySize) {
-    command(CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY);
+void PinataClient::mldsaSetPublicPrivateKeyPair(const uint8_t *publicKey, size_t publicKeySize,
+                                                const uint8_t *privateKey, size_t privateKeySize) {
+    command(CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY);
     write(publicKey, publicKeySize);
     write(privateKey, privateKeySize);
     if (readNumber<uint8_t>() != 0) {
@@ -144,9 +144,9 @@ void PinataClient::dilithiumSetPublicPrivateKeyPair(const uint8_t *publicKey, si
     }
 }
 
-void PinataClient::dilithiumSign(const uint8_t *messageBuffer, size_t messageBufferSize, uint8_t *signedMessageBuffer,
-                                 size_t signedMessageBufferSize) {
-    command(CMD_SW_DILITHIUM_SIGN);
+void PinataClient::mldsaSign(const uint8_t *messageBuffer, size_t messageBufferSize, uint8_t *signedMessageBuffer,
+                             size_t signedMessageBufferSize) {
+    command(CMD_SW_MLDSA_SIGN);
     write(messageBuffer, messageBufferSize);
     if (readNumber<uint8_t>() != 0) {
         throw std::runtime_error("pinata failed to sign this message");
@@ -154,22 +154,22 @@ void PinataClient::dilithiumSign(const uint8_t *messageBuffer, size_t messageBuf
     read(signedMessageBuffer, signedMessageBufferSize);
 }
 
-bool PinataClient::dilithiumVerify(const uint8_t *signatureBuffer, size_t signatureBufferSize) {
-    command(CMD_SW_DILITHIUM_VERIFY);
+bool PinataClient::mldsaVerify(const uint8_t *signatureBuffer, size_t signatureBufferSize) {
+    command(CMD_SW_MLDSA_VERIFY);
     write(signatureBuffer, signatureBufferSize);
     return readNumber<uint8_t>() == 0;
 }
 
-std::pair<int, int> PinataClient::kyber512GetKeySizes() {
-    command(CMD_SW_KYBER512_GET_KEY_SIZES);
+std::pair<int, int> PinataClient::mlkemGetKeySizes() {
+    command(CMD_SW_MLKEM_GET_KEY_SIZES);
     const uint16_t publicKeySize = readNumber<uint16_t>();
     const uint16_t privateKeySize = readNumber<uint16_t>();
     return std::make_pair(publicKeySize, privateKeySize);
 }
 
-void PinataClient::kyber512SetPublicPrivateKeyPair(const uint8_t *publicKey, size_t publicKeySize,
-                                                   const uint8_t *privateKey, size_t privateKeySize) {
-    command(CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY);
+void PinataClient::mlkemSetPublicPrivateKeyPair(const uint8_t *publicKey, size_t publicKeySize,
+                                                const uint8_t *privateKey, size_t privateKeySize) {
+    command(CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY);
     write(publicKey, publicKeySize);
     write(privateKey, privateKeySize);
     if (readNumber<uint8_t>() != 0) {
@@ -177,9 +177,9 @@ void PinataClient::kyber512SetPublicPrivateKeyPair(const uint8_t *publicKey, siz
     }
 }
 
-void PinataClient::kyber512Generate(uint8_t *sharedSecretBuffer, size_t sharedSecretBufferSize,
-                                    uint8_t *keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize) {
-    command(CMD_SW_KYBER512_GENERATE);
+void PinataClient::mlkemGenerate(uint8_t *sharedSecretBuffer, size_t sharedSecretBufferSize,
+                                 uint8_t *keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize) {
+    command(CMD_SW_MLKEM_GENERATE);
     if (readNumber<uint8_t>() != 0) {
         throw std::runtime_error("failed to generate shared secret");
     }
@@ -187,10 +187,9 @@ void PinataClient::kyber512Generate(uint8_t *sharedSecretBuffer, size_t sharedSe
     read(keyEncapsulationMessageBuffer, keyEncapsulationMessageBufferSize);
 }
 
-void PinataClient::kyber512Decode(const uint8_t *keyEncapsulationMessageBuffer,
-                                  size_t keyEncapsulationMessageBufferSize, uint8_t *sharedSecretBuffer,
-                                  size_t sharedSecretBufferSize) {
-    command(CMD_SW_KYBER512_DEC);
+void PinataClient::mlkemDecode(const uint8_t *keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize,
+                               uint8_t *sharedSecretBuffer, size_t sharedSecretBufferSize) {
+    command(CMD_SW_MLKEM_DEC);
     write(keyEncapsulationMessageBuffer, keyEncapsulationMessageBufferSize);
     if (readNumber<uint8_t>() != 0) {
         throw std::runtime_error("failed to decode shared secret");
diff --git a/PinataTests/common.hpp b/PinataTests/common.hpp
index 4c1989b..d354e78 100644
--- a/PinataTests/common.hpp
+++ b/PinataTests/common.hpp
@@ -40,15 +40,15 @@ class PinataClient {
 
     std::pair<int, int> getVersion();
     FirmwareVariant determineFirmwareVariant();
-    std::pair<int, int> dilithiumGetKeySizes();
-    uint8_t dilithiumGetSecurityLevel();
-    void dilithiumSetPublicPrivateKeyPair(const uint8_t* publicKey, size_t publicKeySize, const uint8_t* privateKey, size_t privateKeySize);
-    void dilithiumSign(const uint8_t* messageBuffer, size_t messageBufferSize, uint8_t* signedMessageBuffer, size_t signedMessageBufferSize);
-    bool dilithiumVerify(const uint8_t* signatureBuffer, size_t signatureBufferSize);
-    std::pair<int, int> kyber512GetKeySizes();
-    void kyber512SetPublicPrivateKeyPair(const uint8_t* publicKey, size_t publicKeySize, const uint8_t* privateKey, size_t privateKeySize);
-    void kyber512Generate(uint8_t* sharedSecretBuffer, size_t sharedSecretBufferSize, uint8_t* keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize);
-    void kyber512Decode(const uint8_t* keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize, uint8_t* sharedSecretBuffer, size_t sharedSecretBufferSize);
+    std::pair<int, int> mldsaGetKeySizes();
+    uint8_t mldsaGetSecurityLevel();
+    void mldsaSetPublicPrivateKeyPair(const uint8_t* publicKey, size_t publicKeySize, const uint8_t* privateKey, size_t privateKeySize);
+    void mldsaSign(const uint8_t* messageBuffer, size_t messageBufferSize, uint8_t* signedMessageBuffer, size_t signedMessageBufferSize);
+    bool mldsaVerify(const uint8_t* signatureBuffer, size_t signatureBufferSize);
+    std::pair<int, int> mlkemGetKeySizes();
+    void mlkemSetPublicPrivateKeyPair(const uint8_t* publicKey, size_t publicKeySize, const uint8_t* privateKey, size_t privateKeySize);
+    void mlkemGenerate(uint8_t* sharedSecretBuffer, size_t sharedSecretBufferSize, uint8_t* keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize);
+    void mlkemDecode(const uint8_t* keyEncapsulationMessageBuffer, size_t keyEncapsulationMessageBufferSize, uint8_t* sharedSecretBuffer, size_t sharedSecretBufferSize);
     
     void doSymmetricCipherRequest(const uint8_t cmd, const uint8_t* input, const size_t inputSize, uint8_t* output,const size_t outputSize);
     void SWDESEncrypt(const uint8_t* plaintext, uint8_t* ciphertext);
diff --git a/README.md b/README.md
index 5bf7b1e..43afbc8 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Pinata board
 
 Pinata is a development board (ARM Cortex-M4F) that has been modified and programmed in order to be a training target 
-for Side Channel Analysis (SCA) and Fault Injection (FI) attacks.
+for Side Channel Analysis (SCA) and Fault Injection (FI) attacks. It is based on the STM32F4Discovery board.
 
 ## Features
 
@@ -77,18 +77,16 @@ in the manual).
 #### Lattice-based
 |                    |        | SW       | HW |
 |--------------------|--------|----------|----|
-| CRYSTALS-Dilithium |        |          | -  |
-|                    | LEVEL2 | -        | -  |
-|                    | LEVEL3 | SIG, VER | -  |
-|                    | LEVEL5 | -        | -  |
-| CRYSTALS-Kyber     |        |          |    |
+| ML-DSA FIPS 204    |        |          | -  |
+|                    |     44 | -        | -  |
+|                    |     65 | SIG, VER | -  |
+|                    |     87 | -        | -  |
+| ML-KEM FIPS 203    |        |          |    |
 |                    |    512 | ENC, DEC | -  |
 |                    |    768 | -        | -  |
 |                    |   1024 | -        | -  |
-|                    | 512-90s| -        | -  |
-|                    | 768-90s| -        | -  |
-|                    |1024-90s| -        | -  |
 
+Note: ML-DSA and ML-KEM are implemented in terms of the [PQM4 library for Cortex-M4 processors](https://github.com/mupq/pqm4.git). The exact git commit hash that is used can be found in the src/CMakeLists.txt file. The library is downloaded into the $BUILD/\_deps/pqm4-src folder.
 
 ### Hash functions
 
@@ -113,39 +111,42 @@ Hardware only
 
 ## Building Pinata Firmware
 
-You can build pinata firmware either using wsl or on a native linux machine. The description bellow is based on an
-UBUNTU 22.04 machine. These steps will also work for wsl, but to get access to the Pinata board in wsl, see the 
-troubleshooting steps about Windows and wsl.
+You can build Pinata firmware either using WSL or on a native Linux machine. The description below is based on an Ubuntu 22.04 machine. These steps will also work for WSL, but to get access to the Pinata board in WSL, see the troubleshooting steps about Windows and WSL.
 
 ### Requirements
 
-For cross - compiling the STM32F4Discovery board, you will need a `gcc-arm-none-eabi` toolchain and `cmake`
+To cross-compile the firmware for the STM32F4Discovery board and flash it, install the following packages:
 ```sh
-sudo apt-get install gcc-arm-none-eabi cmake
-```
-
-For flashing the STM32F4Discovery board, you will need the dfu-util toolkit:
-```sh
-sudo apt-get install dfu-util
+sudo apt-get install gcc-arm-none-eabi cmake dfu-util
 ```
 
 ### Cross-compiling the firmware
 
-For cross-compiling pinata:
+#### Configure
+
+Run the following command to configure the project for the gcc-arm-none-eabi compiler toolchain:
 
 ```sh
-cmake -DCMAKE_TOOLCHAIN_FILE=gcc-arm-none-eabi.toolchain.cmake -S. -Bbuild && cmake --build build
+cmake -DCMAKE_TOOLCHAIN_FILE=gcc-arm-none-eabi.toolchain.cmake -S . -B build
 ```
 
-This will compile all Pinata variations. Output binaries can be found in the `build` folder.
+This will create a ./build folder where you run your Makefile targets. You may customize the path to your compiler toolchain by defining a `PREFIX` variable on the command-line when invoking cmake. See the `gcc-arm-none-eabi.toolchain.cmake` toolchain file for details. (For regular Ubuntu/WSL installations, you don't need to modify the `PREFIX` variable).
+
+#### Build
+
+To build everything, just run `make` inside the configured ./build folder.
+
+This will compile all Pinata variations, which are currently "classic", "hw", and "pqc". Output binaries can be found in the `./build/src` folder.
+
+* The "classic" variant contains non-pqc software ciphers.
+* The "hw" variant contains non-pqc software ciphers, as well as _hardware-accelerated_ ciphers.
+* The "pqc" variant contains ML-DSA FIPS 204 and ML-KEM FIPS 203 software implementations.
 
 Example of compiling a particular firmware:
 
-```sh
- cmake --build build --target classic_bin
-```
+Run `make classic_bin` inside the ./build folder to build only the classic firmware binary. In general, there is a `help` Makefile target defined that you may invoke to view the possible targets to build.
 
-### Flashing the firmware
+### Flashing the firmware (Linux-based)
 
 Add a udev rule for the Pinata:
 
@@ -153,7 +154,7 @@ Add a udev rule for the Pinata:
 sudo mkdir -p /etc/udev/rules.d && echo 'SUBSYSTEM=="usb", ATTRS{idVendor}=="0483", MODE="0666", GROUP="plugdev"' | sudo tee /etc/udev/rules.d/69-pinata.rules
 ```
 
-Add user to the `plugdev` group:
+Add your user to the `plugdev` group:
 
 ```sh
 sudo usermod -a -G plugdev $USER
@@ -161,17 +162,31 @@ sudo usermod -a -G plugdev $USER
 
 Check if the physical Pinata is connected to the build machine using the micro USB port on the Pinata.
 
-Run the following command for flashing classic firmware
+Each firmware variant (classic, hw, pqc) has an associated "flash target" that allows one to flash the device while making sure the firmware is up-to-date with the source code. These special targets are named:
+
+* classic_flash
+* hw_flash
+* pqc_flash
+
+For example, run the following command for flashing the classic firmware onto the connected device:
 
 ```sh
-cmake --build build --target classic_flash
+make classic_flash
 ```
 
-Note that this command also makes sure the firmware binary is up-to-date, so for quick iteration loops you can just always run this after editing source code.
+This is assuming you configured with the _Unix Makefiles_ generator.
 
 ## Testing
 
-For more information on testing Pinata functionality, see [PinataTests/README.md](PinataTests/README.md).
+We maintain some integration tests for ensuring the ciphers on the device match reference implementations in the real world. For more information on testing Pinata functionality, see [PinataTests/README.md](PinataTests/README.md).
+
+## Usage
+
+The Pinata firmware works in a "request-response" manner where it waits for a command to appear via UART, optionally with arguments, then processes the command, and then optionally sends back a response.
+
+The available commands are described in the src/main.h file. In there, each `#define` line that starts with `CMD_` is a possible request. Each command is 1 byte, and the argument list for the command depends on the particular command. The arguments for the command are described in comments above the `#define` line.
+
+For the purposes of side-channel analysis, you are supposed to measure the voltage of the chip while the Pinata firmware is running a cryptographic operation. This has been made easy for you to do, because the interesting operations are wrapped in macro blocks named `BEGIN_INTERESTING_STUFF` and `END_INTERESTING_STUFF`. These macros will set GPIO Pin 2 to high and low, respectively. This allows you to trigger an oscilloscope on this GPIO pin and you'll know exactly where the interesting operation happens.
 
 ## Troubleshooting
 
diff --git a/patches/mldsa-sign.patch b/patches/mldsa-sign.patch
new file mode 100644
index 0000000..8b1b703
--- /dev/null
+++ b/patches/mldsa-sign.patch
@@ -0,0 +1,48 @@
+diff --git a/crypto_sign/ml-dsa-44/m4fstack/sign.c b/crypto_sign/ml-dsa-44/m4fstack/sign.c
+index a08d6d6..c50d2a5 100644
+--- a/crypto_sign/ml-dsa-44/m4fstack/sign.c
++++ b/crypto_sign/ml-dsa-44/m4fstack/sign.c
+@@ -11,6 +11,8 @@
+ 
+ #include "smallntt.h"
+ 
++#include "pinata_callbacks.h"
++
+ /*************************************************
+ * Name:        crypto_sign_keypair
+ *
+@@ -137,6 +139,7 @@ int crypto_sign_signature_ctx(uint8_t *sig,
+   uint16_t nonce = 0;
+   uint8_t wcomp[K][768];
+   uint8_t ccomp[68];
++  uint8_t PINATA_PATCH_rejected_once = 0;
+ 
+   union {
+     shake128incctx s128;
+@@ -227,6 +230,10 @@ rej:
+   poly_challenge_compress(ccomp, tmp0);
+   
+   /* Compute z, reject if it reveals secret */
++  if (PINATA_PATCH_rejected_once == 0) {
++    PINATA_PATCH_mldsa_start_callback();
++    PINATA_PATCH_rejected_once = 1;
++  }
+     for(size_t l_idx=0;l_idx < L; l_idx++){
+     if(l_idx != 0){
+       poly_challenge_decompress(tmp0, ccomp);
+@@ -240,11 +247,14 @@ rej:
+ 
+       poly_reduce(tmp0);
+ 
+-      if(poly_chknorm(tmp0, GAMMA1 - BETA))
++      if(poly_chknorm(tmp0, GAMMA1 - BETA)) {
++        // PINATA PATCH note: a rejection jumps back to rej, so the trigger has variable length! Trigger on the falling edge.
+         goto rej;
++      }
+ 
+       polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0);
+   }
++  PINATA_PATCH_mldsa_finish_callback();
+ 
+ 
+   /* Write signature */
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index abd6265..b51ec60 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,8 @@
 # Find programs that are used in flashing the firmware to the device.
 find_program(DFU_SUFFIX dfu-suffix)
 find_program(FLASHTOOL dfu-util)
+# Git is required for applying patches
+find_package(Git REQUIRED)
 
 if(NOT DFU_SUFFIX)
     message(STATUS "dfu-suffix was not found in $PATH, cannot create .dfu files")
@@ -49,6 +51,14 @@ target_include_directories(common PUBLIC
 target_compile_options(common PUBLIC -Werror)
 target_compile_definitions(common PUBLIC STM32F4XX __FPU_USED USE_STDPERIPH_DRIVER HAVE_C99INCLUDES)
 
+include(FetchContent)
+# Keep this git hash the same as the one in PinataTests/CMakeLists.txt !
+FetchContent_Declare(pqm4
+    GIT_REPOSITORY https://github.com/mupq/pqm4.git
+    GIT_TAG a24bb4b662016968c19f5e6a0719c9ad530f0286
+)
+FetchContent_MakeAvailable(pqm4)
+
 # List of target names
 set(TARGETS classic hw pqc)
 
@@ -133,24 +143,23 @@ macro(add_licensed_subdir SUBDIR TARGETS SPDX_LICENSE_IDENTIFIER WEBSITE SOURCE_
     list(APPEND PINATA_LICENSED_SUBDIRS ${SUBDIR})
 endmacro()
 
-#                   Subdirectory       Targets      SPDX License           Informational Website                                       Source Code Origin
-add_licensed_subdir(swDES              "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(swAES              "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(swmAES             "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(rsa                "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(rsacrt             "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(sm4                "classic;hw" "BSD-3-Clause;OpenSSL" https://en.wikipedia.org/wiki/SM4_\(cipher\)                https://raw.githubusercontent.com/openssl/openssl/704e8090b4a789f52af07de9a3ebbe11db8e19f8/crypto/sm4/sm4.c)
-add_licensed_subdir(swAES256           "classic;hw" MIT                    https://github.com/ilvn/aes256                              https://github.com/ilvn/aes256.git)
-add_licensed_subdir(swAES_Ttables      "classic;hw" CC0-1.0                http://www.efgh.com/software/rijndael.htm                   http://www.efgh.com/software/rijndael.txt)
-add_licensed_subdir(present            "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(bignum             "classic;hw" MPL-2.0                https://www.di-mgt.com.au/bigdigits.html                    NOTFOUND)
-add_licensed_subdir(prng               "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(ecc                "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                           https://github.com/Riscure/Pinata.git)
-add_licensed_subdir(curve25519_CortexM "classic;hw" CC0-1.0                https://munacl.cryptojedi.org/curve25519-cortexm0.shtml     https://munacl.cryptojedi.org/data/curve25519-cortexm0-20150813.tar.bz2)
-add_licensed_subdir(pqm4common         pqc          CC0-1.0                https://github.com/mupq/pqm4                                https://github.com/mupq/pqm4.git)
-add_licensed_subdir(dilithium          pqc          CC0-1.0                https://github.com/mupq/pqm4                                https://github.com/mupq/pqm4.git)
-add_licensed_subdir(kyber512           pqc          CC0-1.0                https://github.com/mupq/pqm4                                https://github.com/mupq/pqm4.git)
-#                   Subdirectory       Targets      SPDX License           Informational Website                                       Source Code Origin
+#                   Subdirectory       Targets      SPDX License           Informational Website                                   Source Code Origin
+add_licensed_subdir(swDES              "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(swAES              "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(swmAES             "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(rsa                "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(rsacrt             "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(sm4                "classic;hw" "BSD-3-Clause;OpenSSL" https://en.wikipedia.org/wiki/SM4_\(cipher\)            https://raw.githubusercontent.com/openssl/openssl/704e8090b4a789f52af07de9a3ebbe11db8e19f8/crypto/sm4/sm4.c)
+add_licensed_subdir(swAES256           "classic;hw" MIT                    https://github.com/ilvn/aes256                          https://github.com/ilvn/aes256.git)
+add_licensed_subdir(swAES_Ttables      "classic;hw" CC0-1.0                http://www.efgh.com/software/rijndael.htm               http://www.efgh.com/software/rijndael.txt)
+add_licensed_subdir(present            "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(bignum             "classic;hw" MPL-2.0                https://www.di-mgt.com.au/bigdigits.html                NOTFOUND)
+add_licensed_subdir(prng               "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(ecc                "classic;hw" BSD-3-Clause-Clear     https://github.com/Riscure/Pinata                       https://github.com/Riscure/Pinata.git)
+add_licensed_subdir(curve25519_CortexM "classic;hw" CC0-1.0                https://munacl.cryptojedi.org/curve25519-cortexm0.shtml https://munacl.cryptojedi.org/data/curve25519-cortexm0-20150813.tar.bz2)
+add_licensed_subdir(mldsa              pqc          CC0-1.0                https://github.com/mupq/pqm4                            https://github.com/mupq/pqm4.git)
+add_licensed_subdir(mlkem              pqc          CC0-1.0                https://github.com/mupq/pqm4                            https://github.com/mupq/pqm4.git)
+#                   Subdirectory       Targets      SPDX License           Informational Website                                   Source Code Origin
 
 add_licensed_subdir(
     tea
@@ -160,11 +169,80 @@ add_licensed_subdir(
     "https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm#Reference_code;https://en.wikipedia.org/wiki/XTEA#Implementations"
 )
 
-# Minor tweaks for the firmware targets.
+# Tweaks for the firmware targets.
 target_compile_definitions(hw PRIVATE HW_CRYPTO_PRESENT)
-target_include_directories(pqc PRIVATE pqm4common)
+set(mldsa_base_dir "${pqm4_SOURCE_DIR}/crypto_sign/ml-dsa-65/m4fstack")
+set(mlkem_base_dir "${pqm4_SOURCE_DIR}/crypto_kem/ml-kem-512/m4fstack")
+set(mupq_common_dir "${pqm4_SOURCE_DIR}/mupq/common")
+set(mupq_common_source_files
+    "${mupq_common_dir}/fips202.c"
+    "${mupq_common_dir}/keccakf1600.c"
+    "${mupq_common_dir}/nistseedexpander.c"
+)
+set(mldsa_source_files
+    "${mldsa_base_dir}/ntt.S"
+    "${mldsa_base_dir}/packing.c"
+    "${mldsa_base_dir}/pointwise_mont.s"
+    "${mldsa_base_dir}/poly.c"
+    "${mldsa_base_dir}/polyvec.c"
+    "${mldsa_base_dir}/rounding.c"
+    "${mldsa_base_dir}/sign.c"
+    "${mldsa_base_dir}/smallntt_769.S"
+    "${mldsa_base_dir}/smallpoly.c"
+    "${mldsa_base_dir}/stack.c"
+    "${mldsa_base_dir}/symmetric-shake.c"
+    "${mldsa_base_dir}/vector.s"
+)
+set(mlkem_source_files
+    "${mlkem_base_dir}/cbd.c"
+    "${mlkem_base_dir}/cmov_int16.S"
+    "${mlkem_base_dir}/fastaddsub.S"
+    "${mlkem_base_dir}/fastbasemul.S"
+    "${mlkem_base_dir}/fastinvntt.S"
+    "${mlkem_base_dir}/fastntt.S"
+    "${mlkem_base_dir}/indcpa.c"
+    "${mlkem_base_dir}/kem.c"
+    "${mlkem_base_dir}/matacc.c"
+    "${mlkem_base_dir}/matacc_asm.S"
+    "${mlkem_base_dir}/ntt.c"
+    "${mlkem_base_dir}/poly.c"
+    "${mlkem_base_dir}/poly_asm.S"
+    "${mlkem_base_dir}/polyvec.c"
+    "${mlkem_base_dir}/reduce.S"
+    "${mlkem_base_dir}/symmetric-fips202.c"
+    "${mlkem_base_dir}/verify.c"
+)
+target_sources(pqc PRIVATE pqm4_hal/randombytes.c pqm4_hal/pinata_callbacks.c ${mupq_common_source_files} ${mldsa_source_files} ${mlkem_source_files})
+set_source_files_properties(mldsa/wrapper.c PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/pqm4_hal;${mldsa_base_dir}")
+set_source_files_properties(mlkem/wrapper.c PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/pqm4_hal;${mlkem_base_dir}")
+set_source_files_properties(${mldsa_source_files} PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/pqm4_hal;${mupq_common_dir}")
+set_source_files_properties(${mlkem_source_files} PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/pqm4_hal;${mupq_common_dir}")
 target_compile_definitions(pqc PRIVATE VARIANT_PQC $<$<BOOL:${RANDOM_SIGNING}>:DILITHIUM_RANDOMIZED_SIGNING>)
 
+# For the short triggering in ML-DSA sign, we need to modify the sign.c source a bit.
+# We use a patch file for that. Apply that patch file as part of the build.
+# The stamp file in OUTPUT records that the patch has been applied.
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/mldsa-sign.patch.applied"
+    # Discard local modifications first, so that re-applying the patch does not
+    # fail on an already-patched tree.
+    COMMAND "${GIT_EXECUTABLE}" checkout .
+    COMMAND "${GIT_EXECUTABLE}" apply "${CMAKE_CURRENT_SOURCE_DIR}/../patches/mldsa-sign.patch"
+    COMMAND "${CMAKE_COMMAND}" -E touch "${CMAKE_CURRENT_BINARY_DIR}/mldsa-sign.patch.applied"
+    # Re-run this rule whenever the patch file itself changes.
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/../patches/mldsa-sign.patch"
+    WORKING_DIRECTORY "${pqm4_SOURCE_DIR}"
+    COMMENT "Applying patch ${CMAKE_CURRENT_SOURCE_DIR}/../patches/mldsa-sign.patch"
+    VERBATIM
+)
+
+# Expose the stamp file as a target, so that other targets can depend on it.
+add_custom_target(apply-mldsa-sign-patch
+    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/mldsa-sign.patch.applied"
+)
+# Make sure the patch has been applied before the pqc firmware is built.
+add_dependencies(pqc apply-mldsa-sign-patch)
+
 # After having collected all license information into lists, we now generate
 # the notice file.
 set(NOTICE_FILE "${CMAKE_CURRENT_BINARY_DIR}/ThirdPartyLicenses.txt")
diff --git a/src/dilithium/CMakeLists.txt b/src/dilithium/CMakeLists.txt
deleted file mode 100644
index 073dbb5..0000000
--- a/src/dilithium/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-target_licensed_sources(
-  packing.c
-  polyvec.c
-  api.h
-  pointwise_mont.s
-  sign.h
-  smallpoly.h
-  vector.h
-  ntt.S
-  config.h
-  poly.c
-  symmetric.h
-  polyvec.h
-  poly.h
-  rounding.h
-  params.h
-  wrapper.h
-  ntt.h
-  pointwise_mont.h
-  rounding.c
-  vector.s
-  wrapper.c
-  packing.h
-  smallntt.S
-  reduce.h
-  smallntt.h
-  sign.c
-  symmetric-shake.c
-  smallpoly.c
-)
diff --git a/src/dilithium/README.md b/src/dilithium/README.md
deleted file mode 100644
index a2a2a33..0000000
--- a/src/dilithium/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-The code here is based on
-
-https://github.com/mupq/pqm4
-commit: 992f0f226503d43b6d33278ecb60a9168ed8d787
-
-The above commit seems to most closely match "NIST Submission Round 3" of the reference implementation. The reference implementation can be found at
-
-https://github.com/pq-crystals/dilithium
-
-There seems to be no git tag for Round 3. However, there is a git tag for "v3.1". The "NIST Submission Round 3" seems to correspond to the code just before "v3.1".
diff --git a/src/dilithium/api.h b/src/dilithium/api.h
deleted file mode 100644
index a289632..0000000
--- a/src/dilithium/api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
-
-int crypto_sign_signature(uint8_t *sig, size_t *siglen,
-                          const uint8_t *m, size_t mlen,
-                          const uint8_t *sk);
-
-int crypto_sign(uint8_t *sm, size_t *smlen,
-                const uint8_t *m, size_t mlen,
-                const uint8_t *sk);
-
-int crypto_sign_verify(const uint8_t *sig, size_t siglen,
-                       const uint8_t *m, size_t mlen,
-                       const uint8_t *pk);
-
-int crypto_sign_open(uint8_t *m, size_t *mlen,
-                     const uint8_t *sm, size_t smlen,
-                     const uint8_t *pk);
-
-#endif
diff --git a/src/dilithium/config.h b/src/dilithium/config.h
deleted file mode 100644
index 5572407..0000000
--- a/src/dilithium/config.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef CONFIG_H
-#define CONFIG_H
-
-#define DILITHIUM_MODE 3
-// #define SIGN_STACKSTRATEGY 2
-
-#endif
diff --git a/src/dilithium/macros.i b/src/dilithium/macros.i
deleted file mode 100644
index 25d98c2..0000000
--- a/src/dilithium/macros.i
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef MACROS_I
-#define MACROS_I
-// 3
-.macro montgomery_mul_32 a, b, Qprime, Q, tmp, tmp2
-    smull \tmp, \a, \a, \b
-    mul \tmp2, \tmp, \Qprime
-    smlal \tmp, \a, \tmp2, \Q
-.endm
-
-// 2
-.macro addSub1 c0, c1
-    add.w \c0, \c1
-    sub.w \c1, \c0, \c1, lsl #1
-.endm
-
-// 3
-.macro addSub2 c0, c1, c2, c3
-    add \c0, \c1
-    add \c2, \c3
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-.endm
-
-// 6
-.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7
-    add \c0, \c1
-    add \c2, \c3
-    add \c4, \c5
-    add \c6, \c7
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-    sub.w \c5, \c4, \c5, lsl #1
-    sub.w \c7, \c6, \c7, lsl #1
-.endm
-
-.macro _2_layer_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2
-    montgomery_mul_32 \c2, \zeta0, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
-    addSub2 \c0, \c2, \c1, \c3
-
-    montgomery_mul_32 \c1, \zeta1, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
-    addSub2 \c0, \c1, \c2, \c3
-.endm
-
-.macro _2_layer_inv_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2
-    montgomery_mul_32 \c1, \zeta0, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
-    addSub2 \c0, \c1, \c2, \c3
-
-    montgomery_mul_32 \c2, \zeta1, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
-    addSub2 \c0, \c2, \c1, \c3
-.endm
-
-.macro _3_layer_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    vmov.w \twiddle, \xi0
-    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7
-
-    vmov.w \twiddle, \xi1
-    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi2
-    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7
-
-    vmov.w \twiddle, \xi3
-    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi4
-    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi5
-    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi6
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7
-.endm
-
-.macro _3_layer_inv_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    vmov.w \twiddle, \xi0
-    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7
-
-    vmov.w \twiddle, \xi1
-    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi2
-    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7
-
-    vmov.w \twiddle, \xi3
-    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi4
-    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi5
-    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    vmov.w \twiddle, \xi6
-    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7
-.endm
-
-/************************************************************
-* Name:         _3_layer_inv_butterfly_light_fast_first
-*
-* Description:  upper half of 3-layer inverse butterfly
-*               defined over X^8 - 1
-*
-* Input:        (c4, c1, c6, c3) = coefficients on the upper half;
-*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
-*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
-*               Montgomery domain
-*
-* Symbols:      R = 2^32
-*
-* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
-*
-* Output:
-*               c4 =  c4 + c1        + (c6 + c3)
-*               c5 = (c4 - c1) w_4   + (c6 + c3) w_8^3
-*               c6 =  c4 + c1        - (c6 + c3)
-*               c7 = (c4 - c1) w_8^3 + (c6 + c3) w_4
-************************************************************/
-// 15
-.macro _3_layer_inv_butterfly_light_fast_first c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    addSub2 \c4, \c1, \c6, \c3
-    addSub1 \c4, \c6
-
-    vmov.w \tmp, \xi4
-    vmov.w \tmp2, \xi6
-
-    smull.w \c0, \c5, \c1, \tmp
-    smlal.w \c0, \c5, \c3, \tmp2
-    mul.w \twiddle, \c0, \Qprime
-    smlal.w \c0, \c5, \twiddle, \Q
-
-    smull.w \c2, \c7, \c1, \tmp2
-    smlal.w \c2, \c7, \c3, \tmp
-    mul.w \twiddle, \c2, \Qprime
-    smlal.w \c2, \c7, \twiddle, \Q
-.endm
-
-/************************************************************
-* Name:         _3_layer_inv_butterfly_light_fast_second
-*
-* Description:  lower half of 3-layer inverse butterfly
-*               defined over X^8 - 1, and the 2nd
-*               layer of butterflies
-*
-* Input:
-*               (c4, c5, c6, c7) = results of the upper half;
-*               (c0, c1, c2, c3) = coefficients on the lower half;
-*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
-*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
-*               Montgomery domain
-*
-* Symbols:      R = 2^32
-*
-* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
-*
-* Output:       (normal order)
-*               c0 =   c0 + c1     + (c2 + c3)         + (  c4 + c5     + (c6 + c7)       )
-*               c1 =  (c0 - c1) w3 + (c2 - c3)  w4     + ( (c4 - c5) w5 + (c6 - c7) w6    )
-*               c2 = ( c0 + c1     - (c2 + c3)) w1     + (( c4 + c5     - (c6 + c7)   ) w2)
-*               c3 = ((c0 - c1) w3 - (c2 - c3)  w4) w1 + (((c4 - c5) w5 - (c6 - c7) w6) w2)
-*               c4 =   c0 + c1     - (c2 + c3)         - (  c4 + c5     + (c6 + c7)       ) w0
-*               c5 =  (c0 - c1) w3 + (c2 - c3)  w4     - ( (c4 - c5) w5 + (c6 - c7) w6    ) w0
-*               c6 = ( c0 + c1     - (c2 + c3)) w1     - (( c4 + c5     - (c6 + c7)   ) w2) w0
-*               c7 = ((c0 - c1) w3 - (c2 - c3)  w4) w1 - (((c4 - c5) w5 - (c6 - c7) w6) w2) w0
-************************************************************/
-// 19
-.macro _3_layer_inv_butterfly_light_fast_second c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    addSub2 \c0, \c1, \c2, \c3
-
-    vmov.w \twiddle, \xi2
-    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
-    addSub2 \c0, \c2, \c1, \c3
-
-    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
-
-    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7
-.endm
-
-#endif /* MACROS_I */
diff --git a/src/dilithium/macros_fnt.i b/src/dilithium/macros_fnt.i
deleted file mode 100644
index 25903e4..0000000
--- a/src/dilithium/macros_fnt.i
+++ /dev/null
@@ -1,158 +0,0 @@
-// 2
-.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1
-    \ldrstr \c0, [\target, \mem0]
-    \ldrstr \c1, [\target, \mem1]
-.endm
-
-// 2
-.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c0, [\target], \jump
-.endm
-
-// 4
-.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3
-    \ldrstr \c0, [\target, \mem0]
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c2, [\target, \mem2]
-    \ldrstr \c3, [\target, \mem3]
-.endm
-
-// 4
-.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c2, [\target, \mem2]
-    \ldrstr \c3, [\target, \mem3]
-    \ldrstr \c0, [\target], \jump
-.endm
-
-// 8
-.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7
-    ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3
-    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7
-.endm
-
-// 8
-.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump
-    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7
-    ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump
-.endm
-
-
-
-.macro addSub1 c0, c1
-    add.w \c0, \c1
-    sub.w \c1, \c0, \c1, lsl #1
-.endm
-
-.macro addSub2 c0, c1, c2, c3
-    add \c0, \c1
-    add \c2, \c3
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-.endm
-
-.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7
-    add \c0, \c1
-    add \c2, \c3
-    add \c4, \c5
-    add \c6, \c7
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-    sub.w \c5, \c4, \c5, lsl #1
-    sub.w \c7, \c6, \c7, lsl #1
-.endm
-
-// 2
-.macro barrett_32 a, Qbar, Q, tmp
-    smmulr.w \tmp, \a, \Qbar
-    mls.w \a, \tmp, \Q, \a
-.endm
-
-.macro FNT_CT_butterfly c0, c1, logW
-    add.w \c0, \c0, \c1, lsl #\logW
-    sub.w \c1, \c0, \c1, lsl #(\logW+1)
-.endm
-
-.macro shift_subAdd c0, c1, shlv
-    sub.w \c0, \c0, \c1, lsl #(\shlv)
-    add.w \c1, \c0, \c1, lsl #(\shlv+1)
-.endm
-
-.macro FNT_CT_ibutterfly c0, c1, shlv
-    shift_subAdd \c0, \c1, \shlv
-.endm
-
-// 46
-.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    vmov.w \twiddle, \xi0
-
-    // c0, c1, c2, c3, c4, c5, c6, c7, c8
-    // 0,4
-    mla \tmp, \c4, \twiddle, \c0
-    mls \c4, \c4, \twiddle, \c0
-
-    // 1,5
-    mla \c0, \c5, \twiddle, \c1
-    mls \c5, \c5, \twiddle, \c1
-
-    // 2,6
-    mla \c1, \c6, \twiddle, \c2
-    mls \c6, \c6, \twiddle, \c2
-
-    // 3,7
-    mla \c2, \c7, \twiddle, \c3
-    mls \c7, \c7, \twiddle, \c3
-
-    // tmp, c0, c1, c2, c4, c5, c6, c7
-
-    barrett_32 \tmp, \Qprime, \Q, \c3
-    barrett_32 \c0, \Qprime, \Q, \c3
-    barrett_32 \c1, \Qprime, \Q, \c3
-    barrett_32 \c2, \Qprime, \Q, \c3
-    barrett_32 \c4, \Qprime, \Q, \c3
-    barrett_32 \c5, \Qprime, \Q, \c3
-    barrett_32 \c6, \Qprime, \Q, \c3
-    barrett_32 \c7, \Qprime, \Q, \c3
-
-    vmov.w \twiddle, \xi1
-    // 0,2
-    mla \tmp2, \c1, \twiddle, \tmp
-    mls \c3, \c1, \twiddle, \tmp
-
-    // 1,3
-    mla \tmp, \c2, \twiddle, \c0
-    mls \c0, \c2, \twiddle, \c0
-
-    vmov.w \twiddle, \xi2
-
-    // 4,6
-    mla \c2, \c6, \twiddle, \c4
-    mls \c1, \c6, \twiddle, \c4
-
-    // 5,7
-    mla \c6, \c7, \twiddle, \c5
-    mls \c7, \c7, \twiddle, \c5
-
-    // tmp2, tmp, c3, c0 | c2, c6, c1, c7
-
-    // 4,5
-    vmov.w \twiddle, \xi5
-    mla \c4, \c6, \twiddle, \c2
-    mls \c5, \c6, \twiddle, \c2
-
-    // 6,7
-    vmov.w \twiddle, \xi6
-    mla \c6, \c7, \twiddle, \c1
-    mls \c7, \c7, \twiddle, \c1
-
-    // 2,3
-    vmov.w \twiddle, \xi4
-    mla \c2, \c0, \twiddle, \c3
-    mls \c3, \c0, \twiddle, \c3
-
-    // 0,1
-    vmov.w \twiddle, \xi3
-    mla \c0, \tmp, \twiddle, \tmp2
-    mls \c1, \tmp, \twiddle, \tmp2
-.endm
\ No newline at end of file
diff --git a/src/dilithium/ntt.S b/src/dilithium/ntt.S
deleted file mode 100644
index 53a36bc..0000000
--- a/src/dilithium/ntt.S
+++ /dev/null
@@ -1,582 +0,0 @@
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-
-// author: Markus Krausz
-// date: 18.03.18
-
-// reuse of the NTT zeta table also for NTT^-1
-// added by Marco Palumbi (marco@palumbi.it)
-// date: 21.12.20
-
-.syntax unified
-
-
-// This code uses UMULL - which is constant time on the M4, but not on the M3
-// Make sure that this code is never used on an M3
-smlad r0,r0,r0,r0
-
-// ##############################
-// ##########   NTT    ##########
-// ##############################
-
-//CT butterfly with Montgomery reduction  -- SIGNED
-.macro ct_butterfly_montg pol0, pol1, zeta, q, qinv, th, tl
-  smull \tl, \th, \pol1, \zeta
-  mul \pol1, \tl, \qinv  // q is -qinv
-  smlal \tl, \th, \pol1, \q
-  sub \pol1, \pol0, \th
-  add.w \pol0, \pol0, \th
-.endm
-
-//void pqcrystals_dilithium_ntt(int32_t p[N]);
-.global pqcrystals_dilithium_ntt
-.type pqcrystals_dilithium_ntt,%function
-.align 2
-pqcrystals_dilithium_ntt:
-  //bind aliases
-  ptr_p     .req R0
-  ptr_zeta  .req R1
-  qinv      .req R2
-  q         .req R3
-  cntr      .req R4
-  pol0      .req R5
-  pol1      .req R6
-  pol2      .req R7
-  pol3      .req R8
-  temp_h    .req R9
-  temp_l    .req R10
-  zeta0     .req R11
-  zeta1     .req R12
-  zeta2     .req R14
-
-  //preserve registers
-  push {R4-R11, R14}
-  ldr ptr_zeta, =#zetas_interleaved_asm
-  add ptr_zeta, #4
-  //load constants, ptr
-  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
-  ldr.w q, inv_ntt_asm_smull_q
-  //stage 1 and 2
-  ldr.w cntr, inv_ntt_asm_smull_64
-
-  ldr zeta1, [ptr_zeta, #4]  //z2
-  ldr zeta2, [ptr_zeta, #8]  //z3
-  ldr zeta0, [ptr_zeta], #12  //z1
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr pol1, [ptr_p, #256]  //64*4
-    ldr pol2, [ptr_p, #512]  //128*4
-    ldr pol3, [ptr_p, #768]  //192*4
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage1
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage1
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage2
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage2
-
-    str pol1, [ptr_p, #256]
-    str pol2, [ptr_p, #512]
-    str pol3, [ptr_p, #768]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  sub ptr_p, #256      // on pol0 again
-
-  //stage 3 and 4
-  movw cntr, #16
-  ldr zeta1, [ptr_zeta, #4]  //z8
-  ldr zeta2, [ptr_zeta, #8]  //z9
-  ldr zeta0, [ptr_zeta], #12  //z4
-  1:
-    ldr.w pol0, [ptr_p]  //16*4
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage4
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add.w ptr_p, ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-  ldr zeta1, [ptr_zeta, #4]  //z10
-  ldr zeta2, [ptr_zeta, #8]  //z11
-  ldr zeta0, [ptr_zeta], #12  //z5
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage4
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]  //(16*3-1)*4
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add ptr_p, ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-
-  ldr.w zeta1, [ptr_zeta, #4]  //z12
-  ldr.w zeta2, [ptr_zeta, #8]  //z13
-  ldr zeta0, [ptr_zeta], #12  //z6
-  1:
-    ldr.w pol0, [ptr_p]  //16*4
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]  //(16*3)*4
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage4
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-  ldr.w zeta1, [ptr_zeta, #4]  //z14
-  ldr.w zeta2, [ptr_zeta, #8]  //z15
-  ldr zeta0, [ptr_zeta], #12  //z7
-  1:
-    ldr.w pol0, [ptr_p]  //16*4
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage3
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage4
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192] //(16*3-1)*4
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  sub ptr_p, #832      //(208)*4
-
-  //stage 5 and 6
-  movw cntr, #16
-  1:
-    ldr.w zeta1, [ptr_zeta, #4]  //z32, ..., z62
-    ldr.w zeta2, [ptr_zeta, #8]  //z33, ..., z63
-    ldr zeta0, [ptr_zeta], #12  //z16, ..., z31
-
-    ldr.w pol0, [ptr_p]  //4*4
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]  //(4*3)*4
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage6
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]  //(4*3-1)*4
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage6
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p] //4*4
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage6
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48] //(4*3-1)*4
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p]  //4*4
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]  //(4*3)*4
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage5
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage6
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]
-    str pol0, [ptr_p], #52
-
-    subs.w cntr, cntr, #1
-    bne 1b
-  sub ptr_p, #1024      //256*4
-
-  //stage 7 and 8
-  mov cntr, #64
-  1:
-
-    ldr.w zeta1, [ptr_zeta, #4]  //z128,..., z254
-    ldr.w zeta2, [ptr_zeta, #8]  //z129,..., z255
-    ldr zeta0, [ptr_zeta], #12  //z64, ..., z127
-    ldr.w pol0, [ptr_p]  //1*4
-    ldr.w pol1, [ptr_p, #4]
-    ldr.w pol2, [ptr_p, #8]
-    ldr.w pol3, [ptr_p, #12]  //3*4
-    ct_butterfly_montg pol0, pol2, zeta0, q, qinv, temp_h, temp_l  //stage7
-    ct_butterfly_montg pol1, pol3, zeta0, q, qinv, temp_h, temp_l  //stage7
-    ct_butterfly_montg pol0, pol1, zeta1, q, qinv, temp_h, temp_l  //stage8
-    ct_butterfly_montg pol2, pol3, zeta2, q, qinv, temp_h, temp_l  //stage8
-
-    str.w pol1, [ptr_p, #4]
-    str.w pol2, [ptr_p, #8]
-    str.w pol3, [ptr_p, #12]
-    str pol0, [ptr_p], #16
-    subs cntr, #1
-    bne 1b
-
-    //restore registers
-    pop {R4-R11, PC}
-
-    //unbind aliases
-    .unreq ptr_p
-    .unreq ptr_zeta
-    .unreq qinv
-    .unreq q
-    .unreq cntr
-    .unreq pol0
-    .unreq pol1
-    .unreq pol2
-    .unreq pol3
-    .unreq temp_h
-    .unreq temp_l
-    .unreq zeta0
-    .unreq zeta1
-    .unreq zeta2
-
-// ##############################
-// ##########  NTT^-1  ##########
-// ##############################
-
-//GS butterfly with Montgomery reduction  -- SIGNED
-.macro gs_butterfly_montg pol0, pol1, zeta, q, qinv, x, y
-  sub \x, \pol1, \pol0         // x= -(pol0 - pol1)
-  add.w \pol0, \pol0, \pol1
-  smull \y, \pol1, \x, \zeta   // -(pol0-pol1)*zeta -> (pol0-pol1)*(-zeta)
-  mul \x, \y, \qinv  //qinv is -qinv
-  smlal \y, \pol1, \x, \q
-.endm
-
-// Montgomery reduction -- SIGNED
-.macro montg_red f, pol, q, qinv, x, y
-  smull \y, \pol, \pol, \f
-  mul \x, \y, \qinv
-  smlal \y, \pol, \x, \q
-.endm
-
-//void pqcrystals_dilithium_invntt_tomont(int32_t p[N]);
-.global pqcrystals_dilithium_invntt_tomont
-.type pqcrystals_dilithium_invntt_tomont,%function
-.align 2
-pqcrystals_dilithium_invntt_tomont:
-  //bind aliases
-  ptr_p     .req R0
-  ptr_zeta  .req R1
-  qinv      .req R2
-  q         .req R3
-  cntr      .req R4
-  pol0      .req R5
-  pol1      .req R6
-  pol2      .req R7
-  pol3      .req R8
-  temp_h    .req R9
-  temp_l    .req R10
-  zeta0     .req R11
-  zeta1     .req R12
-  zeta2     .req R14
-
-  //preserve registers
-  push {R4-R11, R14}
-  //load constants, ptr
-  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
-  ldr.w q  , inv_ntt_asm_smull_q
-  ldr ptr_zeta, =#zetas_interleaved_asm + 1020  // &zetas_interleaved_asm[N-1]
-
-  //stage 1 and 2
-  ldr.w cntr, inv_ntt_asm_smull_64
-  1:
-    ldr.w zeta1, [ptr_zeta, #-4]  //z1, ..., z127
-    ldr.w zeta2, [ptr_zeta, #-8]  //z128,.., z191
-    ldr zeta0, [ptr_zeta], #-12  //z0, ..., z126
-    ldr.w pol0, [ptr_p]  //1*4
-    ldr.w pol1, [ptr_p, #4]
-    ldr.w pol2, [ptr_p, #8]
-    ldr.w pol3, [ptr_p, #12]  //3*4
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage1
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage1
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage2
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage2
-    str.w pol1, [ptr_p, #4]
-    str.w pol2, [ptr_p, #8]
-    str.w pol3, [ptr_p, #12]
-    str pol0, [ptr_p], #16
-    subs cntr, #1
-    bne 1b
-  sub ptr_p, #1024      // on pol0 again
-
-  //stage 3 and 4
-  movw cntr, #16
-  1:
-    ldr.w zeta1, [ptr_zeta, #-4]
-    ldr.w zeta2, [ptr_zeta, #-8]
-    ldr zeta0, [ptr_zeta], #-12
-
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage4
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage4
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage4
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48] //(4*3-1)*4
-    str pol0, [ptr_p], #4
-
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #16]
-    ldr.w pol2, [ptr_p, #32]
-    ldr.w pol3, [ptr_p, #48]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage3
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage4
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage4
-    str.w pol1, [ptr_p, #16]
-    str.w pol2, [ptr_p, #32]
-    str.w pol3, [ptr_p, #48]
-    str pol0, [ptr_p], #52
-    subs.w cntr, cntr, #1
-    bne 1b
-  sub ptr_p, #1024      //256*4
-
-  //stage 5 and 6
-  movw cntr, #16
-  ldr.w zeta1, [ptr_zeta, #-4]
-  ldr.w zeta2, [ptr_zeta, #-8]
-  ldr zeta0, [ptr_zeta], #-12
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage6
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192] //(16*3-1)*4
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-  ldr.w zeta1, [ptr_zeta, #-4]
-  ldr.w zeta2, [ptr_zeta, #-8]
-  ldr zeta0, [ptr_zeta], #-12
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage6
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add.w ptr_p, ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-  ldr.w zeta1, [ptr_zeta, #-4]
-  ldr.w zeta2, [ptr_zeta, #-8]
-  ldr zeta0, [ptr_zeta], #-12
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage6
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  add ptr_p, #192      //(64-16)*4
-
-  movw cntr, #16
-  ldr.w zeta1, [ptr_zeta, #-4]
-  ldr.w zeta2, [ptr_zeta, #-8]
-  ldr zeta0, [ptr_zeta], #-12
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #64]
-    ldr.w pol2, [ptr_p, #128]
-    ldr.w pol3, [ptr_p, #192]
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage5
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage6
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage6
-    str.w pol1, [ptr_p, #64]
-    str.w pol2, [ptr_p, #128]
-    str.w pol3, [ptr_p, #192]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-  sub ptr_p, #832      //(208)*4
-
-  //stage 7 and 8
-  movw cntr, #64
-  ldr.w zeta0, [ptr_zeta]
-  ldr.w zeta1, [ptr_zeta, #-4]
-  ldr.w zeta2, [ptr_zeta, #-12] // zetas_interleaved_asm[0] = 4404704
-  ldr.w ptr_zeta, inv_ntt_asm_smull_f //ptr_zeta not needed anymore, contains now value f for final reduction
-  1:
-    ldr.w pol0, [ptr_p]
-    ldr.w pol1, [ptr_p, #256]  //64*4
-    ldr.w pol2, [ptr_p, #512]  //128*4
-    ldr.w pol3, [ptr_p, #768]  //192*4
-    gs_butterfly_montg pol0, pol1, zeta0, q, qinv, temp_h, temp_l  //stage7
-    gs_butterfly_montg pol2, pol3, zeta1, q, qinv, temp_h, temp_l  //stage7
-    gs_butterfly_montg pol0, pol2, zeta2, q, qinv, temp_h, temp_l  //stage8
-    gs_butterfly_montg pol1, pol3, zeta2, q, qinv, temp_h, temp_l  //stage8
-    montg_red ptr_zeta, pol0, q, qinv, temp_h, temp_l        //final reduction
-    montg_red ptr_zeta, pol1, q, qinv, temp_h, temp_l        //final reduction
-
-    // We can save the multiplication by f here by instead pre-computing it and putting it into the
-    // twiddle factor of the previous butterfly
-    // zeta2 is 4404704 instead of 25847 (4404704 = (25847 * 16382) % Q)
-
-    //montg_red ptr_zeta, pol2, q, qinv, temp_h, temp_l        //final reduction
-    //montg_red ptr_zeta, pol3, q, qinv, temp_h, temp_l        //final reduction
-    str.w pol1, [ptr_p, #256]
-    str.w pol2, [ptr_p, #512]
-    str.w pol3, [ptr_p, #768]
-    str pol0, [ptr_p], #4
-    subs cntr, #1
-    bne 1b
-
-    //restore registers
-    pop {R4-R11, PC}
-
-    //unbind aliases
-    .unreq ptr_p
-    .unreq ptr_zeta
-    .unreq qinv
-    .unreq q
-    .unreq cntr
-    .unreq pol0
-    .unreq pol1
-    .unreq pol2
-    .unreq pol3
-    .unreq temp_h
-    .unreq temp_l
-    .unreq zeta0
-    .unreq zeta1
-    .unreq zeta2
-
-.align 2
-inv_ntt_asm_smull_f:
-.word 41978
-.align 2
-inv_ntt_asm_smull_qinv:
-.word 0xfc7fdfff
-.align 2
-inv_ntt_asm_smull_q:
-.word 8380417
-.align 2
-inv_ntt_asm_smull_64:
-.word 64
-
-
-.section .rodata
-/* Roots of unity in order needed by forward ntt */
-.type zetas_interleaved_asm, %object
-.align 2
-zetas_interleaved_asm:
-.word 4404704, 25847, -2608894, -518909, 237124, 1826347, 2353451, -777960, -359251, -2091905, -876248, 3119733, -2884855, 466468, 3111497, 2680103, 2725464, 2706023, 95776, 1024112, 3077325, 3530437, -1079900, -1661693, -3592148, 3585928, -2537516, 3915439, -549488, -3861115, -3043716, -1119584, 3574422, -2867647, 2619752, 3539968, -300467, -2108549, 2348700, -539299, -2118186, -1699267, -1643818, -3859737, 3505694, -3821735, -1399561, 3507263, -2140649, -3277672, -1600420, 3699596, 1757237, 811944, 531354, -19422, 954230, 3881043, 4010497, 3900724, -2556880, 280005, 2071892, -2797779, -3930395, 2091667, 3407706, -1528703, 2316500, 3817976, -3677745, -3342478, 2244091, -3041255, -2446433, -3562462, -1452451, 266997, 2434439, 3475950, -1235728, 3513181, 2176455, -3520352, -3759364, -1585221, -1197226, -3193378, -1257611, 900702, 1859098, 1939314, 909542, 819034, -4083598, 495491, -1613174, -1000202, -43260, -522500, -3190144, -655327, -3122442, -3157330, 2031748, 3207046, -3632928, -3556995, -525098, 126922, -768622, -3595838, 3412210, 342297, 286988, -983419, -2437823, 4108315, 2147896, 3437287, -3342277, 2715295, 1735879, 203044, -2967645, 2842341, 2691481, -3693493, -2590150, 1265009, -411027, 4055324, 1247620, -2477047, 2486353, 1595974, -671102, -3767016, 1250494, -1228525, 2635921, -3548272, -22981, -2994039, 1869119, -1308169, 1903435, -1050970, -381987, -1333058, 1237275, 1349076, -3318210, -1430225, 1852771, -451100, 1312455, -1430430, 3306115, -1962642, -3343383, -1279661, 1917081, 264944, -2546312, -1374803, 508951, 1500165, 777191, 3097992, 2235880, 3406031, 44288, -542412, -2831860, -1100098, -1671176, -1846953, 904516, -2584293, -3724270, 3958618, 594136, -3776993, -3724342, -2013608, 2432395, -8578, 2454455, -164721, 1653064, 1957272, 3369112, -3249728, 185531, -1207385, 2389356, -3183426, 162844, -210977, 1616392, 3014001, 759969, 810149, 1652634, -1316856, -3694233, -1799107, 189548, -3038916, 3523897, -3553272, 3866901, 269760, 3159746, 2213111, 
-975884, -1851402, 1717735, 472078, -2409325, -426683, 1723600, -177440, -1803090, 1910376, 1315589, -1667432, -1104333, 1341330, -260646, -3833893, 1285669, -2939036, -2235985, -1584928, -420899, -2286327, -812732, 183443, -976891, -1439742, 1612842, -3545687, -3019102, -554416, 3919660, -3881060, -48306, -1362209, -3628969, 3937738, 1400424, 3839961, -846154, 1976782
-.size zetas_interleaved_asm,.-zetas_interleaved_asm
diff --git a/src/dilithium/ntt.h b/src/dilithium/ntt.h
deleted file mode 100644
index 731132d..0000000
--- a/src/dilithium/ntt.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define ntt DILITHIUM_NAMESPACE(ntt)
-void ntt(int32_t a[N]);
-
-#define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
-void invntt_tomont(int32_t a[N]);
-
-#endif
diff --git a/src/dilithium/packing.c b/src/dilithium/packing.c
deleted file mode 100644
index ae463d1..0000000
--- a/src/dilithium/packing.c
+++ /dev/null
@@ -1,237 +0,0 @@
-#include "params.h"
-#include "packing.h"
-#include "polyvec.h"
-#include "poly.h"
-
-/*************************************************
-* Name:        pack_pk
-*
-* Description: Bit-pack public key pk = (rho, t1).
-*
-* Arguments:   - uint8_t pk[]: output byte array
-*              - const uint8_t rho[]: byte array containing rho
-*              - const polyveck *t1: pointer to vector t1
-**************************************************/
-void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
-             const uint8_t rho[SEEDBYTES],
-             const polyveck *t1)
-{
-  unsigned int i;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    pk[i] = rho[i];
-  pk += SEEDBYTES;
-
-  for(i = 0; i < K; ++i)
-    polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]);
-}
-
-/*************************************************
-* Name:        unpack_pk
-*
-* Description: Unpack public key pk = (rho, t1).
-*
-* Arguments:   - const uint8_t rho[]: output byte array for rho
-*              - const polyveck *t1: pointer to output vector t1
-*              - uint8_t pk[]: byte array containing bit-packed pk
-**************************************************/
-void unpack_pk(uint8_t rho[SEEDBYTES],
-               polyveck *t1,
-               const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
-{
-  unsigned int i;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    rho[i] = pk[i];
-  pk += SEEDBYTES;
-
-  for(i = 0; i < K; ++i)
-    polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES);
-}
-
-/*************************************************
-* Name:        pack_sk
-*
-* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2).
-*
-* Arguments:   - uint8_t sk[]: output byte array
-*              - const uint8_t rho[]: byte array containing rho
-*              - const uint8_t tr[]: byte array containing tr
-*              - const uint8_t key[]: byte array containing key
-*              - const polyveck *t0: pointer to vector t0
-*              - const polyvecl *s1: pointer to vector s1
-*              - const polyveck *s2: pointer to vector s2
-**************************************************/
-void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
-             const uint8_t rho[SEEDBYTES],
-             const uint8_t tr[CRHBYTES],
-             const uint8_t key[SEEDBYTES],
-             const polyveck *t0,
-             const polyvecl *s1,
-             const polyveck *s2)
-{
-  unsigned int i;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    sk[i] = rho[i];
-  sk += SEEDBYTES;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    sk[i] = key[i];
-  sk += SEEDBYTES;
-
-  for(i = 0; i < CRHBYTES; ++i)
-    sk[i] = tr[i];
-  sk += CRHBYTES;
-
-  for(i = 0; i < L; ++i)
-    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]);
-  sk += L*POLYETA_PACKEDBYTES;
-
-  for(i = 0; i < K; ++i)
-    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]);
-  sk += K*POLYETA_PACKEDBYTES;
-
-  for(i = 0; i < K; ++i)
-    polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]);
-}
-
-/*************************************************
-* Name:        unpack_sk
-*
-* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2).
-*
-* Arguments:   - const uint8_t rho[]: output byte array for rho
-*              - const uint8_t tr[]: output byte array for tr
-*              - const uint8_t key[]: output byte array for key
-*              - const polyveck *t0: pointer to output vector t0
-*              - const polyvecl *s1: pointer to output vector s1
-*              - const polyveck *s2: pointer to output vector s2
-*              - uint8_t sk[]: byte array containing bit-packed sk
-**************************************************/
-void unpack_sk(uint8_t rho[SEEDBYTES],
-               uint8_t tr[CRHBYTES],
-               uint8_t key[SEEDBYTES],
-               polyveck *t0,
-               polyvecl *s1,
-               polyveck *s2,
-               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
-{
-  unsigned int i;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    rho[i] = sk[i];
-  sk += SEEDBYTES;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    key[i] = sk[i];
-  sk += SEEDBYTES;
-
-  for(i = 0; i < CRHBYTES; ++i)
-    tr[i] = sk[i];
-  sk += CRHBYTES;
-
-  for(i=0; i < L; ++i)
-    polyeta_unpack(&s1->vec[i], sk + i*POLYETA_PACKEDBYTES);
-  sk += L*POLYETA_PACKEDBYTES;
-
-  for(i=0; i < K; ++i)
-    polyeta_unpack(&s2->vec[i], sk + i*POLYETA_PACKEDBYTES);
-  sk += K*POLYETA_PACKEDBYTES;
-
-  for(i=0; i < K; ++i)
-    polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES);
-}
-
-/*************************************************
-* Name:        pack_sig
-*
-* Description: Bit-pack signature sig = (c, z, h).
-*
-* Arguments:   - uint8_t sig[]: output byte array
-*              - const uint8_t *c: pointer to challenge hash length SEEDBYTES
-*              - const polyvecl *z: pointer to vector z
-*              - const polyveck *h: pointer to hint vector h
-**************************************************/
-void pack_sig(uint8_t sig[CRYPTO_BYTES],
-              const uint8_t c[SEEDBYTES],
-              const polyvecl *z,
-              const polyveck *h)
-{
-  unsigned int i, j, k;
-
-  for(i=0; i < SEEDBYTES; ++i)
-    sig[i] = c[i];
-  sig += SEEDBYTES;
-
-  for(i = 0; i < L; ++i)
-    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
-  sig += L*POLYZ_PACKEDBYTES;
-
-  /* Encode h */
-  for(i = 0; i < OMEGA + K; ++i)
-    sig[i] = 0;
-
-  k = 0;
-  for(i = 0; i < K; ++i) {
-    for(j = 0; j < N; ++j)
-      if(h->vec[i].coeffs[j] != 0)
-        sig[k++] = j;
-
-    sig[OMEGA + i] = k;
-  }
-}
-
-/*************************************************
-* Name:        unpack_sig
-*
-* Description: Unpack signature sig = (c, z, h).
-*
-* Arguments:   - uint8_t *c: pointer to output challenge hash
-*              - polyvecl *z: pointer to output vector z
-*              - polyveck *h: pointer to output hint vector h
-*              - const uint8_t sig[]: byte array containing
-*                bit-packed signature
-*
-* Returns 1 in case of malformed signature; otherwise 0.
-**************************************************/
-int unpack_sig(uint8_t c[SEEDBYTES],
-               polyvecl *z,
-               polyveck *h,
-               const uint8_t sig[CRYPTO_BYTES])
-{
-  unsigned int i, j, k;
-
-  for(i = 0; i < SEEDBYTES; ++i)
-    c[i] = sig[i];
-  sig += SEEDBYTES;
-
-  for(i = 0; i < L; ++i)
-    polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES);
-  sig += L*POLYZ_PACKEDBYTES;
-
-  /* Decode h */
-  k = 0;
-  for(i = 0; i < K; ++i) {
-    for(j = 0; j < N; ++j)
-      h->vec[i].coeffs[j] = 0;
-
-    if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
-      return 1;
-
-    for(j = k; j < sig[OMEGA + i]; ++j) {
-      /* Coefficients are ordered for strong unforgeability */
-      if(j > k && sig[j] <= sig[j-1]) return 1;
-      h->vec[i].coeffs[sig[j]] = 1;
-    }
-
-    k = sig[OMEGA + i];
-  }
-
-  /* Extra indices are zero for strong unforgeability */
-  for(j = k; j < OMEGA; ++j)
-    if(sig[j])
-      return 1;
-
-  return 0;
-}
diff --git a/src/dilithium/packing.h b/src/dilithium/packing.h
deleted file mode 100644
index be75aaa..0000000
--- a/src/dilithium/packing.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef PACKING_H
-#define PACKING_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-#define pack_pk DILITHIUM_NAMESPACE(pack_pk)
-void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);
-
-#define pack_sk DILITHIUM_NAMESPACE(pack_sk)
-void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
-             const uint8_t rho[SEEDBYTES],
-             const uint8_t tr[CRHBYTES],
-             const uint8_t key[SEEDBYTES],
-             const polyveck *t0,
-             const polyvecl *s1,
-             const polyveck *s2);
-
-#define pack_sig DILITHIUM_NAMESPACE(pack_sig)
-void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);
-
-#define unpack_pk DILITHIUM_NAMESPACE(unpack_pk)
-void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]);
-
-#define unpack_sk DILITHIUM_NAMESPACE(unpack_sk)
-void unpack_sk(uint8_t rho[SEEDBYTES],
-               uint8_t tr[CRHBYTES],
-               uint8_t key[SEEDBYTES],
-               polyveck *t0,
-               polyvecl *s1,
-               polyveck *s2,
-               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
-
-#define unpack_sig DILITHIUM_NAMESPACE(unpack_sig)
-int unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]);
-
-#endif
diff --git a/src/dilithium/params.h b/src/dilithium/params.h
deleted file mode 100644
index db3eb21..0000000
--- a/src/dilithium/params.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#include "config.h"
-
-#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium_##s
-
-
-#define SEEDBYTES 32
-#define CRHBYTES 48
-#define N 256
-#define Q 8380417
-#define D 13
-#define ROOT_OF_UNITY 1753
-
-#if DILITHIUM_MODE == 2
-#define K 4
-#define L 4
-#define ETA 2
-#define TAU 39
-#define BETA 78
-#define GAMMA1 (1 << 17)
-#define GAMMA2 ((Q-1)/88)
-#define OMEGA 80
-#define CRYPTO_ALGNAME "Dilithium2"
-
-#elif DILITHIUM_MODE == 3
-#define K 6
-#define L 5
-#define ETA 4
-#define TAU 49
-#define BETA 196
-#define GAMMA1 (1 << 19)
-#define GAMMA2 ((Q-1)/32)
-#define OMEGA 55
-#define CRYPTO_ALGNAME "Dilithium3"
-
-#elif DILITHIUM_MODE == 5
-#define K 8
-#define L 7
-#define ETA 2
-#define TAU 60
-#define BETA 120
-#define GAMMA1 (1 << 19)
-#define GAMMA2 ((Q-1)/32)
-#define OMEGA 75
-#define CRYPTO_ALGNAME "Dilithium5"
-
-#endif
-
-#define POLYT1_PACKEDBYTES  320
-#define POLYT0_PACKEDBYTES  416
-#define POLYVECH_PACKEDBYTES (OMEGA + K)
-
-#if GAMMA1 == (1 << 17)
-#define POLYZ_PACKEDBYTES   576
-#elif GAMMA1 == (1 << 19)
-#define POLYZ_PACKEDBYTES   640
-#endif
-
-#if GAMMA2 == (Q-1)/88
-#define POLYW1_PACKEDBYTES  192
-#elif GAMMA2 == (Q-1)/32
-#define POLYW1_PACKEDBYTES  128
-#endif
-
-#if ETA == 2
-#define POLYETA_PACKEDBYTES  96
-#elif ETA == 4
-#define POLYETA_PACKEDBYTES 128
-#endif
-
-#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
-#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
-                               + L*POLYETA_PACKEDBYTES \
-                               + K*POLYETA_PACKEDBYTES \
-                               + K*POLYT0_PACKEDBYTES)
-#define CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)
-
-#endif
diff --git a/src/dilithium/pointwise_mont.h b/src/dilithium/pointwise_mont.h
deleted file mode 100644
index 2647a11..0000000
--- a/src/dilithium/pointwise_mont.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef POINTWISE_MONT_H
-#define POINTWISE_MONT_H
-
-#include <stdint.h>
-#include "params.h"
-
-
-#define asm_pointwise_montgomery DILITHIUM_NAMESPACE(asm_pointwise_montgomery)
-void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
-#define asm_pointwise_acc_montgomery DILITHIUM_NAMESPACE(asm_pointwise_acc_montgomery)
-void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
-
-#endif
diff --git a/src/dilithium/pointwise_mont.s b/src/dilithium/pointwise_mont.s
deleted file mode 100644
index e21125d..0000000
--- a/src/dilithium/pointwise_mont.s
+++ /dev/null
@@ -1,128 +0,0 @@
-.syntax unified
-.thumb
-
-.macro montgomery_multiplication res, pa, pb, q, qinv
-    smull \pa, \res, \pa, \pb
-    mul \pb, \pa, \qinv
-    smlal \pa, \res, \pb, \q
-.endm
-
-
-// void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
-.global pqcrystals_dilithium_asm_pointwise_montgomery
-.type pqcrystals_dilithium_asm_pointwise_montgomery,%function
-.align 2
-pqcrystals_dilithium_asm_pointwise_montgomery:
-    push.w {r4-r11, r14}
-    c_ptr .req r0
-    a_ptr .req r1
-    b_ptr .req r2
-    qinv  .req r3
-    q     .req r4
-    pa0   .req r5
-    pa1   .req r6
-    pa2   .req r7
-    pb0   .req r8
-    pb1   .req r9
-    pb2   .req r10
-    tmp0  .req r11
-    ctr   .req r12
-    res   .req r14
-
-    movw qinv, #:lower16:0xfc7fdfff
-    movt qinv, #:upper16:0xfc7fdfff
-    movw q, #0xE001
-    movt q, #0x7F
-
-
-    // 85x3 = 255 coefficients
-    movw ctr, #85
-    1:
-        ldr.w pa1, [a_ptr, #4]
-        ldr.w pa2, [a_ptr, #8]
-        ldr pa0, [a_ptr], #12
-        ldr.w pb1, [b_ptr, #4]
-        ldr.w pb2, [b_ptr, #8]
-        ldr pb0, [b_ptr], #12
-
-        montgomery_multiplication res, pa0, pb0, q, qinv
-        str res, [c_ptr], #4
-        montgomery_multiplication res, pa1, pb1, q, qinv
-        str res, [c_ptr], #4
-        montgomery_multiplication res, pa2, pb2, q, qinv
-        str res, [c_ptr], #4
-    subs ctr, #1
-    bne.w 1b
-
-    // final coefficient
-    ldr.w pa0, [a_ptr]
-    ldr.w pb0, [b_ptr]
-    montgomery_multiplication res, pa0, pb0, q, qinv
-    str.w res, [c_ptr]
-
-    pop.w {r4-r11, pc}
-.size pqcrystals_dilithium_asm_pointwise_montgomery, .-pqcrystals_dilithium_asm_pointwise_montgomery
-
-// void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
-.global pqcrystals_dilithium_asm_pointwise_acc_montgomery
-.type pqcrystals_dilithium_asm_pointwise_acc_montgomery,%function
-.align 2
-pqcrystals_dilithium_asm_pointwise_acc_montgomery:
-    push.w {r4-r11, r14}
-    c_ptr .req r0
-    a_ptr .req r1
-    b_ptr .req r2
-    qinv  .req r3
-    q     .req r4
-    pa0   .req r5
-    pa1   .req r6
-    pa2   .req r7
-    pb0   .req r8
-    pb1   .req r9
-    pb2   .req r10
-    tmp0  .req r11
-    ctr   .req r12
-    res   .req r14
-
-    movw qinv, #:lower16:0xfc7fdfff
-    movt qinv, #:upper16:0xfc7fdfff
-    movw q, #0xE001
-    movt q, #0x7F
-
-
-    // 85x3 = 255 coefficients
-    movw ctr, #85
-    1:
-        ldr.w pa1, [a_ptr, #4]
-        ldr.w pa2, [a_ptr, #8]
-        ldr pa0, [a_ptr], #12
-        ldr.w pb1, [b_ptr, #4]
-        ldr.w pb2, [b_ptr, #8]
-        ldr pb0, [b_ptr], #12
-
-        montgomery_multiplication res, pa0, pb0, q, qinv
-        montgomery_multiplication pa0, pa1, pb1, q, qinv
-        montgomery_multiplication pa1, pa2, pb2, q, qinv
-
-        ldr.w pb0, [c_ptr]
-        ldr.w pb1, [c_ptr, #4]
-        ldr.w pb2, [c_ptr, #8]
-        add.w res, res, pb0
-        str res, [c_ptr], #12
-        add.w pa0, pa0, pb1
-        str pa0, [c_ptr, #-8]
-        add.w pa1, pa1, pb2
-        str pa1, [c_ptr, #-4]
-    subs ctr, #1
-    bne.w 1b
-
-    // final coefficient
-    ldr.w pa0, [a_ptr]
-    ldr.w pb0, [b_ptr]
-    ldr.w pa1, [c_ptr]
-    montgomery_multiplication res, pa0, pb0, q, qinv
-    add.w res, res, pa1
-    str.w res, [c_ptr]
-
-    pop.w {r4-r11, pc}
-.size pqcrystals_dilithium_asm_pointwise_acc_montgomery, .-pqcrystals_dilithium_asm_pointwise_acc_montgomery
diff --git a/src/dilithium/poly.c b/src/dilithium/poly.c
deleted file mode 100644
index 85f94f1..0000000
--- a/src/dilithium/poly.c
+++ /dev/null
@@ -1,889 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "vector.h"
-#include "ntt.h"
-#include "pointwise_mont.h"
-#include "rounding.h"
-#include "symmetric.h"
-
-#ifdef DBENCH
-#include "test/cpucycles.h"
-extern const uint64_t timing_overhead;
-extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack;
-#define DBENCH_START() uint64_t time = cpucycles()
-#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead
-#else
-#define DBENCH_START()
-#define DBENCH_STOP(t)
-#endif
-
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Inplace reduction of all coefficients of polynomial to
-*              representative in [-6283009,6283007].
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *a) {
-  asm_reduce32(a->coeffs);
-}
-
-/*************************************************
-* Name:        poly_caddq
-*
-* Description: For all coefficients of in/out polynomial add Q if
-*              coefficient is negative.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_caddq(poly *a) {
-  asm_caddq(a->coeffs);
-}
-
-#if 0
-/*************************************************
-* Name:        poly_freeze
-*
-* Description: Inplace reduction of all coefficients of polynomial to
-*              standard representatives.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_freeze(poly *a) {
-    asm_freeze(a->coeffs);
-}
-#endif
-
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add polynomials. No modular reduction is performed.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first summand
-*              - const poly *b: pointer to second summand
-**************************************************/
-void poly_add(poly *c, const poly *a, const poly *b)  {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
-
-  DBENCH_STOP(*tadd);
-}
-
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract polynomials. No modular reduction is
-*              performed.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial to be
-*                               subtraced from first input polynomial
-**************************************************/
-void poly_sub(poly *c, const poly *a, const poly *b) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    c->coeffs[i] = a->coeffs[i] - b->coeffs[i];
-
-  DBENCH_STOP(*tadd);
-}
-
-/*************************************************
-* Name:        poly_shiftl
-*
-* Description: Multiply polynomial by 2^D without modular reduction. Assumes
-*              input coefficients to be less than 2^{31-D} in absolute value.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_shiftl(poly *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    a->coeffs[i] <<= D;
-
-  DBENCH_STOP(*tmul);
-}
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Inplace forward NTT. Coefficients can grow by
-*              8*Q in absolute value.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_ntt(poly *a) {
-  DBENCH_START();
-
-  ntt(a->coeffs);
-
-  DBENCH_STOP(*tmul);
-}
-
-/*************************************************
-* Name:        poly_invntt_tomont
-*
-* Description: Inplace inverse NTT and multiplication by 2^{32}.
-*              Input coefficients need to be less than Q in absolute
-*              value and output coefficients are again bounded by Q.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void poly_invntt_tomont(poly *a) {
-  DBENCH_START();
-
-  invntt_tomont(a->coeffs);
-
-  DBENCH_STOP(*tmul);
-}
-
-/*************************************************
-* Name:        poly_pointwise_montgomery
-*
-* Description: Pointwise multiplication of polynomials in NTT domain
-*              representation and multiplication of resulting polynomial
-*              by 2^{-32}.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
-  DBENCH_START();
-
-  asm_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs);
-
-  DBENCH_STOP(*tmul);
-}
-
-/*************************************************
-* Name:        poly_pointwise_acc_montgomery
-*
-* Description: Pointwise multiplication of polynomials in NTT domain
-*              representation, multiplication of resulting polynomial
-*              by 2^{-32} and accumulate.
-*
-* Arguments:   - poly *c: pointer to output (accumulating) polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b) {
-  DBENCH_START();
-
-  asm_pointwise_acc_montgomery(c->coeffs, a->coeffs, b->coeffs);
-
-  DBENCH_STOP(*tmul);
-}
-
-/*************************************************
-* Name:        poly_power2round
-*
-* Description: For all coefficients c of the input polynomial,
-*              compute c0, c1 such that c mod Q = c1*2^D + c0
-*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
-*              standard representatives.
-*
-* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
-*              - poly *a0: pointer to output polynomial with coefficients c0
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_power2round(poly *a1, poly *a0, const poly *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    a1->coeffs[i] = power2round(&a0->coeffs[i], a->coeffs[i]);
-
-  DBENCH_STOP(*tround);
-}
-
-/*************************************************
-* Name:        poly_decompose
-*
-* Description: For all coefficients c of the input polynomial,
-*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
-*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
-*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
-*              Assumes coefficients to be standard representatives.
-*
-* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
-*              - poly *a0: pointer to output polynomial with coefficients c0
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void poly_decompose(poly *a1, poly *a0, const poly *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    a1->coeffs[i] = decompose(&a0->coeffs[i], a->coeffs[i]);
-
-  DBENCH_STOP(*tround);
-}
-
-/*************************************************
-* Name:        poly_make_hint
-*
-* Description: Compute hint polynomial. The coefficients of which indicate
-*              whether the low bits of the corresponding coefficient of
-*              the input polynomial overflow into the high bits.
-*
-* Arguments:   - poly *h: pointer to output hint polynomial
-*              - const poly *a0: pointer to low part of input polynomial
-*              - const poly *a1: pointer to high part of input polynomial
-*
-* Returns number of 1 bits.
-**************************************************/
-unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) {
-  unsigned int i, s = 0;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i) {
-    h->coeffs[i] = make_hint(a0->coeffs[i], a1->coeffs[i]);
-    s += h->coeffs[i];
-  }
-
-  DBENCH_STOP(*tround);
-  return s;
-}
-
-/*************************************************
-* Name:        poly_use_hint
-*
-* Description: Use hint polynomial to correct the high bits of a polynomial.
-*
-* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
-*              - const poly *a: pointer to input polynomial
-*              - const poly *h: pointer to input hint polynomial
-**************************************************/
-void poly_use_hint(poly *b, const poly *a, const poly *h) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N; ++i)
-    b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]);
-
-  DBENCH_STOP(*tround);
-}
-
-/*************************************************
-* Name:        poly_chknorm
-*
-* Description: Check infinity norm of polynomial against given bound.
-*              Assumes input coefficients were reduced by reduce32().
-*
-* Arguments:   - const poly *a: pointer to polynomial
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
-**************************************************/
-int poly_chknorm(const poly *a, int32_t B) {
-  unsigned int i;
-  int32_t t;
-  DBENCH_START();
-
-  if(B > (Q-1)/8)
-    return 1;
-
-  /* It is ok to leak which coefficient violates the bound since
-     the probability for each coefficient is independent of secret
-     data but we must not leak the sign of the centralized representative. */
-  for(i = 0; i < N; ++i) {
-    /* Absolute value */
-    t = a->coeffs[i] >> 31;
-    t = a->coeffs[i] - (t & 2*a->coeffs[i]);
-
-    if(t >= B) {
-      DBENCH_STOP(*tsample);
-      return 1;
-    }
-  }
-
-  DBENCH_STOP(*tsample);
-  return 0;
-}
-
-/*************************************************
-* Name:        poly_uniform
-*
-* Description: Sample polynomial with uniformly random coefficients
-*              in [0,Q-1] by performing rejection sampling on the
-*              output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
-*
-* Arguments:   - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
-void poly_uniform(poly *a,
-                  const uint8_t seed[SEEDBYTES],
-                  uint16_t nonce)
-{
-  unsigned int i, ctr, off;
-  unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES;
-  uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2];
-  stream128_state state;
-
-  stream128_init(&state, seed, nonce);
-  stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
-
-  ctr = asm_rej_uniform(a->coeffs, N, buf, buflen);
-
-  while(ctr < N) {
-    off = buflen % 3;
-    for(i = 0; i < off; ++i)
-      buf[i] = buf[buflen - off + i];
-
-    stream128_squeezeblocks(buf + off, 1, &state);
-    buflen = STREAM128_BLOCKBYTES + off;
-    ctr += asm_rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen);
-  }
-}
-
-/*************************************************
-* Name:        rej_eta
-*
-* Description: Sample uniformly random coefficients in [-ETA, ETA] by
-*              performing rejection sampling on array of random bytes.
-*
-* Arguments:   - int32_t *a: pointer to output array (allocated)
-*              - unsigned int len: number of coefficients to be sampled
-*              - const uint8_t *buf: array of random bytes
-*              - unsigned int buflen: length of array of random bytes
-*
-* Returns number of sampled coefficients. Can be smaller than len if not enough
-* random bytes were given.
-**************************************************/
-static unsigned int rej_eta(int32_t *a,
-                            unsigned int len,
-                            const uint8_t *buf,
-                            unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint32_t t0, t1;
-  DBENCH_START();
-
-  ctr = pos = 0;
-  while(ctr < len && pos < buflen) {
-    t0 = buf[pos] & 0x0F;
-    t1 = buf[pos++] >> 4;
-
-#if ETA == 2
-    if(t0 < 15) {
-      t0 = t0 - (205*t0 >> 10)*5;
-      a[ctr++] = 2 - t0;
-    }
-    if(t1 < 15 && ctr < len) {
-      t1 = t1 - (205*t1 >> 10)*5;
-      a[ctr++] = 2 - t1;
-    }
-#elif ETA == 4
-    if(t0 < 9)
-      a[ctr++] = 4 - t0;
-    if(t1 < 9 && ctr < len)
-      a[ctr++] = 4 - t1;
-#endif
-  }
-
-  DBENCH_STOP(*tsample);
-  return ctr;
-}
-
-/*************************************************
-* Name:        poly_uniform_eta
-*
-* Description: Sample polynomial with uniformly random coefficients
-*              in [-ETA,ETA] by performing rejection sampling on the
-*              output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
-*
-* Arguments:   - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-#if ETA == 2
-#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
-#elif ETA == 4
-#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
-#endif
-void poly_uniform_eta(poly *a,
-                      const uint8_t seed[SEEDBYTES],
-                      uint16_t nonce)
-{
-  unsigned int ctr;
-  unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES;
-  uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES];
-  stream128_state state;
-
-  stream128_init(&state, seed, nonce);
-  stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
-
-  ctr = rej_eta(a->coeffs, N, buf, buflen);
-
-  while(ctr < N) {
-    stream128_squeezeblocks(buf, 1, &state);
-    ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES);
-  }
-}
-
-/*************************************************
-* Name:        poly_uniform_gamma1m1
-*
-* Description: Sample polynomial with uniformly random coefficients
-*              in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream
-*              of SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
-*
-* Arguments:   - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
-*              - uint16_t nonce: 16-bit nonce
-**************************************************/
-#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
-void poly_uniform_gamma1(poly *a,
-                         const uint8_t seed[CRHBYTES],
-                         uint16_t nonce)
-{
-  uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES];
-  stream256_state state;
-
-  stream256_init(&state, seed, nonce);
-  stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
-  polyz_unpack(a, buf);
-}
-
-/*************************************************
-* Name:        challenge
-*
-* Description: Implementation of H. Samples polynomial with TAU nonzero
-*              coefficients in {-1,1} using the output stream of
-*              SHAKE256(seed).
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const uint8_t mu[]: byte array containing seed of length SEEDBYTES
-**************************************************/
-void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
-  unsigned int i, b, pos;
-  uint64_t signs;
-  uint8_t buf[SHAKE256_RATE];
-  shake256incctx state;
-
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, seed, SEEDBYTES);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeezeblocks(buf, 1, &state);
-
-  signs = 0;
-  for(i = 0; i < 8; ++i)
-    signs |= (uint64_t)buf[i] << 8*i;
-  pos = 8;
-
-  for(i = 0; i < N; ++i)
-    c->coeffs[i] = 0;
-  for(i = N-TAU; i < N; ++i) {
-    do {
-      if(pos >= SHAKE256_RATE) {
-        shake256_inc_squeezeblocks(buf, 1, &state);
-        pos = 0;
-      }
-
-      b = buf[pos++];
-    } while(b > i);
-
-    c->coeffs[i] = c->coeffs[b];
-    c->coeffs[b] = 1 - 2*(signs & 1);
-    signs >>= 1;
-  }
-}
-
-/*************************************************
-* Name:        polyeta_pack
-*
-* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYETA_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void polyeta_pack(uint8_t *r, const poly *a) {
-  unsigned int i;
-  uint8_t t[8];
-  DBENCH_START();
-
-#if ETA == 2
-  for(i = 0; i < N/8; ++i) {
-    t[0] = ETA - a->coeffs[8*i+0];
-    t[1] = ETA - a->coeffs[8*i+1];
-    t[2] = ETA - a->coeffs[8*i+2];
-    t[3] = ETA - a->coeffs[8*i+3];
-    t[4] = ETA - a->coeffs[8*i+4];
-    t[5] = ETA - a->coeffs[8*i+5];
-    t[6] = ETA - a->coeffs[8*i+6];
-    t[7] = ETA - a->coeffs[8*i+7];
-
-    r[3*i+0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
-    r[3*i+1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
-    r[3*i+2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
-  }
-#elif ETA == 4
-  for(i = 0; i < N/2; ++i) {
-    t[0] = ETA - a->coeffs[2*i+0];
-    t[1] = ETA - a->coeffs[2*i+1];
-    r[i] = t[0] | (t[1] << 4);
-  }
-#endif
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyeta_unpack
-*
-* Description: Unpack polynomial with coefficients in [-ETA,ETA].
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void polyeta_unpack(poly *r, const uint8_t *a) {
-  unsigned int i;
-  DBENCH_START();
-
-#if ETA == 2
-  for(i = 0; i < N/8; ++i) {
-    r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
-    r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
-    r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7;
-    r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
-    r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
-    r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7;
-    r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
-    r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
-
-    r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0];
-    r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1];
-    r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2];
-    r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3];
-    r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4];
-    r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5];
-    r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6];
-    r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7];
-  }
-#elif ETA == 4
-  for(i = 0; i < N/2; ++i) {
-    r->coeffs[2*i+0] = a[i] & 0x0F;
-    r->coeffs[2*i+1] = a[i] >> 4;
-    r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0];
-    r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1];
-  }
-#endif
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyt1_pack
-*
-* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
-*              Input coefficients are assumed to be standard representatives.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYT1_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void polyt1_pack(uint8_t *r, const poly *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N/4; ++i) {
-    r[5*i+0] = (a->coeffs[4*i+0] >> 0);
-    r[5*i+1] = (a->coeffs[4*i+0] >> 8) | (a->coeffs[4*i+1] << 2);
-    r[5*i+2] = (a->coeffs[4*i+1] >> 6) | (a->coeffs[4*i+2] << 4);
-    r[5*i+3] = (a->coeffs[4*i+2] >> 4) | (a->coeffs[4*i+3] << 6);
-    r[5*i+4] = (a->coeffs[4*i+3] >> 2);
-  }
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyt1_unpack
-*
-* Description: Unpack polynomial t1 with 10-bit coefficients.
-*              Output coefficients are standard representatives.
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void polyt1_unpack(poly *r, const uint8_t *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N/4; ++i) {
-    r->coeffs[4*i+0] = ((a[5*i+0] >> 0) | ((uint32_t)a[5*i+1] << 8)) & 0x3FF;
-    r->coeffs[4*i+1] = ((a[5*i+1] >> 2) | ((uint32_t)a[5*i+2] << 6)) & 0x3FF;
-    r->coeffs[4*i+2] = ((a[5*i+2] >> 4) | ((uint32_t)a[5*i+3] << 4)) & 0x3FF;
-    r->coeffs[4*i+3] = ((a[5*i+3] >> 6) | ((uint32_t)a[5*i+4] << 2)) & 0x3FF;
-  }
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyt0_pack
-*
-* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYT0_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void polyt0_pack(uint8_t *r, const poly *a) {
-  unsigned int i;
-  uint32_t t[8];
-  DBENCH_START();
-
-  for(i = 0; i < N/8; ++i) {
-    t[0] = (1 << (D-1)) - a->coeffs[8*i+0];
-    t[1] = (1 << (D-1)) - a->coeffs[8*i+1];
-    t[2] = (1 << (D-1)) - a->coeffs[8*i+2];
-    t[3] = (1 << (D-1)) - a->coeffs[8*i+3];
-    t[4] = (1 << (D-1)) - a->coeffs[8*i+4];
-    t[5] = (1 << (D-1)) - a->coeffs[8*i+5];
-    t[6] = (1 << (D-1)) - a->coeffs[8*i+6];
-    t[7] = (1 << (D-1)) - a->coeffs[8*i+7];
-
-    r[13*i+ 0]  =  t[0];
-    r[13*i+ 1]  =  t[0] >>  8;
-    r[13*i+ 1] |=  t[1] <<  5;
-    r[13*i+ 2]  =  t[1] >>  3;
-    r[13*i+ 3]  =  t[1] >> 11;
-    r[13*i+ 3] |=  t[2] <<  2;
-    r[13*i+ 4]  =  t[2] >>  6;
-    r[13*i+ 4] |=  t[3] <<  7;
-    r[13*i+ 5]  =  t[3] >>  1;
-    r[13*i+ 6]  =  t[3] >>  9;
-    r[13*i+ 6] |=  t[4] <<  4;
-    r[13*i+ 7]  =  t[4] >>  4;
-    r[13*i+ 8]  =  t[4] >> 12;
-    r[13*i+ 8] |=  t[5] <<  1;
-    r[13*i+ 9]  =  t[5] >>  7;
-    r[13*i+ 9] |=  t[6] <<  6;
-    r[13*i+10]  =  t[6] >>  2;
-    r[13*i+11]  =  t[6] >> 10;
-    r[13*i+11] |=  t[7] <<  3;
-    r[13*i+12]  =  t[7] >>  5;
-  }
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyt0_unpack
-*
-* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void polyt0_unpack(poly *r, const uint8_t *a) {
-  unsigned int i;
-  DBENCH_START();
-
-  for(i = 0; i < N/8; ++i) {
-    r->coeffs[8*i+0]  = a[13*i+0];
-    r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8;
-    r->coeffs[8*i+0] &= 0x1FFF;
-
-    r->coeffs[8*i+1]  = a[13*i+1] >> 5;
-    r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3;
-    r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11;
-    r->coeffs[8*i+1] &= 0x1FFF;
-
-    r->coeffs[8*i+2]  = a[13*i+3] >> 2;
-    r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6;
-    r->coeffs[8*i+2] &= 0x1FFF;
-
-    r->coeffs[8*i+3]  = a[13*i+4] >> 7;
-    r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1;
-    r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9;
-    r->coeffs[8*i+3] &= 0x1FFF;
-
-    r->coeffs[8*i+4]  = a[13*i+6] >> 4;
-    r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4;
-    r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12;
-    r->coeffs[8*i+4] &= 0x1FFF;
-
-    r->coeffs[8*i+5]  = a[13*i+8] >> 1;
-    r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7;
-    r->coeffs[8*i+5] &= 0x1FFF;
-
-    r->coeffs[8*i+6]  = a[13*i+9] >> 6;
-    r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2;
-    r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10;
-    r->coeffs[8*i+6] &= 0x1FFF;
-
-    r->coeffs[8*i+7]  = a[13*i+11] >> 3;
-    r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5;
-    r->coeffs[8*i+7] &= 0x1FFF;
-
-    r->coeffs[8*i+0] = (1 << (D-1)) - r->coeffs[8*i+0];
-    r->coeffs[8*i+1] = (1 << (D-1)) - r->coeffs[8*i+1];
-    r->coeffs[8*i+2] = (1 << (D-1)) - r->coeffs[8*i+2];
-    r->coeffs[8*i+3] = (1 << (D-1)) - r->coeffs[8*i+3];
-    r->coeffs[8*i+4] = (1 << (D-1)) - r->coeffs[8*i+4];
-    r->coeffs[8*i+5] = (1 << (D-1)) - r->coeffs[8*i+5];
-    r->coeffs[8*i+6] = (1 << (D-1)) - r->coeffs[8*i+6];
-    r->coeffs[8*i+7] = (1 << (D-1)) - r->coeffs[8*i+7];
-  }
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyz_pack
-*
-* Description: Bit-pack polynomial with coefficients
-*              in [-(GAMMA1 - 1), GAMMA1].
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYZ_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void polyz_pack(uint8_t *r, const poly *a) {
-  unsigned int i;
-  uint32_t t[4];
-  DBENCH_START();
-
-#if GAMMA1 == (1 << 17)
-  for(i = 0; i < N/4; ++i) {
-    t[0] = GAMMA1 - a->coeffs[4*i+0];
-    t[1] = GAMMA1 - a->coeffs[4*i+1];
-    t[2] = GAMMA1 - a->coeffs[4*i+2];
-    t[3] = GAMMA1 - a->coeffs[4*i+3];
-
-    r[9*i+0]  = t[0];
-    r[9*i+1]  = t[0] >> 8;
-    r[9*i+2]  = t[0] >> 16;
-    r[9*i+2] |= t[1] << 2;
-    r[9*i+3]  = t[1] >> 6;
-    r[9*i+4]  = t[1] >> 14;
-    r[9*i+4] |= t[2] << 4;
-    r[9*i+5]  = t[2] >> 4;
-    r[9*i+6]  = t[2] >> 12;
-    r[9*i+6] |= t[3] << 6;
-    r[9*i+7]  = t[3] >> 2;
-    r[9*i+8]  = t[3] >> 10;
-  }
-#elif GAMMA1 == (1 << 19)
-  for(i = 0; i < N/2; ++i) {
-    t[0] = GAMMA1 - a->coeffs[2*i+0];
-    t[1] = GAMMA1 - a->coeffs[2*i+1];
-
-    r[5*i+0]  = t[0];
-    r[5*i+1]  = t[0] >> 8;
-    r[5*i+2]  = t[0] >> 16;
-    r[5*i+2] |= t[1] << 4;
-    r[5*i+3]  = t[1] >> 4;
-    r[5*i+4]  = t[1] >> 12;
-  }
-#endif
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyz_unpack
-*
-* Description: Unpack polynomial z with coefficients
-*              in [-(GAMMA1 - 1), GAMMA1].
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void polyz_unpack(poly *r, const uint8_t *a) {
-  unsigned int i;
-  DBENCH_START();
-
-#if GAMMA1 == (1 << 17)
-  for(i = 0; i < N/4; ++i) {
-    r->coeffs[4*i+0]  = a[9*i+0];
-    r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8;
-    r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16;
-    r->coeffs[4*i+0] &= 0x3FFFF;
-
-    r->coeffs[4*i+1]  = a[9*i+2] >> 2;
-    r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6;
-    r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14;
-    r->coeffs[4*i+1] &= 0x3FFFF;
-
-    r->coeffs[4*i+2]  = a[9*i+4] >> 4;
-    r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4;
-    r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12;
-    r->coeffs[4*i+2] &= 0x3FFFF;
-
-    r->coeffs[4*i+3]  = a[9*i+6] >> 6;
-    r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2;
-    r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10;
-    r->coeffs[4*i+3] &= 0x3FFFF;
-
-    r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0];
-    r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1];
-    r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2];
-    r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3];
-  }
-#elif GAMMA1 == (1 << 19)
-  for(i = 0; i < N/2; ++i) {
-    r->coeffs[2*i+0]  = a[5*i+0];
-    r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8;
-    r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16;
-    r->coeffs[2*i+0] &= 0xFFFFF;
-
-    r->coeffs[2*i+1]  = a[5*i+2] >> 4;
-    r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4;
-    r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12;
-    r->coeffs[2*i+0] &= 0xFFFFF;
-
-    r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0];
-    r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1];
-  }
-#endif
-
-  DBENCH_STOP(*tpack);
-}
-
-/*************************************************
-* Name:        polyw1_pack
-*
-* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
-*              Input coefficients are assumed to be standard representatives.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYW1_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void polyw1_pack(uint8_t *r, const poly *a) {
-  unsigned int i;
-  DBENCH_START();
-
-#if GAMMA2 == (Q-1)/88
-  for(i = 0; i < N/4; ++i) {
-    r[3*i+0]  = a->coeffs[4*i+0];
-    r[3*i+0] |= a->coeffs[4*i+1] << 6;
-    r[3*i+1]  = a->coeffs[4*i+1] >> 2;
-    r[3*i+1] |= a->coeffs[4*i+2] << 4;
-    r[3*i+2]  = a->coeffs[4*i+2] >> 4;
-    r[3*i+2] |= a->coeffs[4*i+3] << 2;
-  }
-#elif GAMMA2 == (Q-1)/32
-  for(i = 0; i < N/2; ++i)
-    r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4);
-#endif
-
-  DBENCH_STOP(*tpack);
-}
diff --git a/src/dilithium/poly.h b/src/dilithium/poly.h
deleted file mode 100644
index fc473ac..0000000
--- a/src/dilithium/poly.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include <stdint.h>
-#include "params.h"
-
-typedef struct {
-  int32_t coeffs[N];
-} poly;
-
-#define poly_reduce DILITHIUM_NAMESPACE(poly_reduce)
-void poly_reduce(poly *a);
-#define poly_caddq DILITHIUM_NAMESPACE(poly_caddq)
-void poly_caddq(poly *a);
-#define poly_freeze DILITHIUM_NAMESPACE(poly_freeze)
-void poly_freeze(poly *a);
-
-#define poly_add DILITHIUM_NAMESPACE(poly_add)
-void poly_add(poly *c, const poly *a, const poly *b);
-#define poly_sub DILITHIUM_NAMESPACE(poly_sub)
-void poly_sub(poly *c, const poly *a, const poly *b);
-#define poly_shiftl DILITHIUM_NAMESPACE(poly_shiftl)
-void poly_shiftl(poly *a);
-
-#define poly_ntt DILITHIUM_NAMESPACE(poly_ntt)
-void poly_ntt(poly *a);
-#define poly_invntt_tomont DILITHIUM_NAMESPACE(poly_invntt_tomont)
-void poly_invntt_tomont(poly *a);
-#define poly_pointwise_montgomery DILITHIUM_NAMESPACE(poly_pointwise_montgomery)
-void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);
-#define poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery)
-void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b);
-
-#define poly_power2round DILITHIUM_NAMESPACE(poly_power2round)
-void poly_power2round(poly *a1, poly *a0, const poly *a);
-#define poly_decompose DILITHIUM_NAMESPACE(poly_decompose)
-void poly_decompose(poly *a1, poly *a0, const poly *a);
-#define poly_make_hint DILITHIUM_NAMESPACE(poly_make_hint)
-unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1);
-#define poly_use_hint DILITHIUM_NAMESPACE(poly_use_hint)
-void poly_use_hint(poly *b, const poly *a, const poly *h);
-
-#define poly_chknorm DILITHIUM_NAMESPACE(poly_chknorm)
-int poly_chknorm(const poly *a, int32_t B);
-#define poly_uniform DILITHIUM_NAMESPACE(poly_uniform)
-void poly_uniform(poly *a,
-                  const uint8_t seed[SEEDBYTES],
-                  uint16_t nonce);
-#define poly_uniform_eta DILITHIUM_NAMESPACE(poly_uniform_eta)
-void poly_uniform_eta(poly *a,
-                      const uint8_t seed[SEEDBYTES],
-                      uint16_t nonce);
-#define poly_uniform_gamma1 DILITHIUM_NAMESPACE(poly_uniform_gamma1)
-void poly_uniform_gamma1(poly *a,
-                         const uint8_t seed[CRHBYTES],
-                         uint16_t nonce);
-#define poly_challenge DILITHIUM_NAMESPACE(poly_challenge)
-void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);
-
-#define polyeta_pack DILITHIUM_NAMESPACE(polyeta_pack)
-void polyeta_pack(uint8_t *r, const poly *a);
-#define polyeta_unpack DILITHIUM_NAMESPACE(polyeta_unpack)
-void polyeta_unpack(poly *r, const uint8_t *a);
-
-#define polyt1_pack DILITHIUM_NAMESPACE(polyt1_pack)
-void polyt1_pack(uint8_t *r, const poly *a);
-#define polyt1_unpack DILITHIUM_NAMESPACE(polyt1_unpack)
-void polyt1_unpack(poly *r, const uint8_t *a);
-
-#define polyt0_pack DILITHIUM_NAMESPACE(polyt0_pack)
-void polyt0_pack(uint8_t *r, const poly *a);
-#define polyt0_unpack DILITHIUM_NAMESPACE(polyt0_unpack)
-void polyt0_unpack(poly *r, const uint8_t *a);
-
-#define polyz_pack DILITHIUM_NAMESPACE(polyz_pack)
-void polyz_pack(uint8_t *r, const poly *a);
-#define polyz_unpack DILITHIUM_NAMESPACE(polyz_unpack)
-void polyz_unpack(poly *r, const uint8_t *a);
-
-#define polyw1_pack DILITHIUM_NAMESPACE(polyw1_pack)
-void polyw1_pack(uint8_t *r, const poly *a);
-
-#endif
diff --git a/src/dilithium/polyvec.c b/src/dilithium/polyvec.c
deleted file mode 100644
index e6d900e..0000000
--- a/src/dilithium/polyvec.c
+++ /dev/null
@@ -1,422 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-#include "poly.h"
-
-/*************************************************
-* Name:        expand_mat
-*
-* Description: Implementation of ExpandA. Generates matrix A with uniformly
-*              random coefficients a_{i,j} by performing rejection
-*              sampling on the output stream of SHAKE128(rho|j|i)
-*              or AES256CTR(rho,j|i).
-*
-* Arguments:   - polyvecl mat[K]: output matrix
-*              - const uint8_t rho[]: byte array containing seed rho
-**************************************************/
-void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
-  unsigned int i, j;
-
-  for(i = 0; i < K; ++i)
-    for(j = 0; j < L; ++j)
-      poly_uniform(&mat[i].vec[j], rho, (i << 8) + j);
-}
-
-void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v);
-}
-
-/**************************************************************/
-/************ Vectors of polynomials of length L **************/
-/**************************************************************/
-
-void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_uniform_eta(&v->vec[i], seed, nonce++);
-}
-
-void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_uniform_gamma1(&v->vec[i], seed, L*nonce + i);
-}
-
-void polyvecl_reduce(polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_reduce(&v->vec[i]);
-}
-
-#if 0
-/*************************************************
-* Name:        polyvecl_freeze
-*
-* Description: Reduce coefficients of polynomials in vector of length L
-*              to standard representatives.
-*
-* Arguments:   - polyvecl *v: pointer to input/output vector
-**************************************************/
-void polyvecl_freeze(polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_freeze(&v->vec[i]);
-}
-#endif
-
-/*************************************************
-* Name:        polyvecl_add
-*
-* Description: Add vectors of polynomials of length L.
-*              No modular reduction is performed.
-*
-* Arguments:   - polyvecl *w: pointer to output vector
-*              - const polyvecl *u: pointer to first summand
-*              - const polyvecl *v: pointer to second summand
-**************************************************/
-void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvecl_ntt
-*
-* Description: Forward NTT of all polynomials in vector of length L. Output
-*              coefficients can be up to 16*Q larger than input coefficients.
-*
-* Arguments:   - polyvecl *v: pointer to input/output vector
-**************************************************/
-void polyvecl_ntt(polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_ntt(&v->vec[i]);
-}
-
-void polyvecl_invntt_tomont(polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_invntt_tomont(&v->vec[i]);
-}
-
-void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvecl_pointwise_acc_montgomery
-*
-* Description: Pointwise multiply vectors of polynomials of length L, multiply
-*              resulting vector by 2^{-32} and add (accumulate) polynomials
-*              in it. Input/output vectors are in NTT domain representation.
-*
-* Arguments:   - poly *w: output polynomial
-*              - const polyvecl *u: pointer to first input vector
-*              - const polyvecl *v: pointer to second input vector
-**************************************************/
-void polyvecl_pointwise_acc_montgomery(poly *w,
-                                       const polyvecl *u,
-                                       const polyvecl *v)
-{
-  unsigned int i;
-
-  poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
-  for(i = 1; i < L; ++i) {
-    poly_pointwise_acc_montgomery(w, &u->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        polyvecl_chknorm
-*
-* Description: Check infinity norm of polynomials in vector of length L.
-*              Assumes input polyvecl to be reduced by polyvecl_reduce().
-*
-* Arguments:   - const polyvecl *v: pointer to vector
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
-* and 1 otherwise.
-**************************************************/
-int polyvecl_chknorm(const polyvecl *v, int32_t bound)  {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    if(poly_chknorm(&v->vec[i], bound))
-      return 1;
-
-  return 0;
-}
-
-/**************************************************************/
-/************ Vectors of polynomials of length K **************/
-/**************************************************************/
-
-void polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_uniform_eta(&v->vec[i], seed, nonce++);
-}
-
-/*************************************************
-* Name:        polyveck_reduce
-*
-* Description: Reduce coefficients of polynomials in vector of length K
-*              to representatives in [-6283009,6283007].
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_reduce(polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_reduce(&v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_caddq
-*
-* Description: For all coefficients of polynomials in vector of length K
-*              add Q if coefficient is negative.
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_caddq(polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_caddq(&v->vec[i]);
-}
-
-#if 0
-/*************************************************
-* Name:        polyveck_freeze
-*
-* Description: Reduce coefficients of polynomials in vector of length K
-*              to standard representatives.
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_freeze(polyveck *v)  {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_freeze(&v->vec[i]);
-}
-#endif
-
-/*************************************************
-* Name:        polyveck_add
-*
-* Description: Add vectors of polynomials of length K.
-*              No modular reduction is performed.
-*
-* Arguments:   - polyveck *w: pointer to output vector
-*              - const polyveck *u: pointer to first summand
-*              - const polyveck *v: pointer to second summand
-**************************************************/
-void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_sub
-*
-* Description: Subtract vectors of polynomials of length K.
-*              No modular reduction is performed.
-*
-* Arguments:   - polyveck *w: pointer to output vector
-*              - const polyveck *u: pointer to first input vector
-*              - const polyveck *v: pointer to second input vector to be
-*                                   subtracted from first input vector
-**************************************************/
-void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_shiftl
-*
-* Description: Multiply vector of polynomials of Length K by 2^D without modular
-*              reduction. Assumes input coefficients to be less than 2^{31-D}.
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_shiftl(polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_shiftl(&v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_ntt
-*
-* Description: Forward NTT of all polynomials in vector of length K. Output
-*              coefficients can be up to 16*Q larger than input coefficients.
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_ntt(polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_ntt(&v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_invntt_tomont
-*
-* Description: Inverse NTT and multiplication by 2^{32} of polynomials
-*              in vector of length K. Input coefficients need to be less
-*              than 2*Q.
-*
-* Arguments:   - polyveck *v: pointer to input/output vector
-**************************************************/
-void polyveck_invntt_tomont(polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_invntt_tomont(&v->vec[i]);
-}
-
-void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
-}
-
-
-/*************************************************
-* Name:        polyveck_chknorm
-*
-* Description: Check infinity norm of polynomials in vector of length K.
-*              Assumes input polyveck to be reduced by polyveck_reduce().
-*
-* Arguments:   - const polyveck *v: pointer to vector
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
-* and 1 otherwise.
-**************************************************/
-int polyveck_chknorm(const polyveck *v, int32_t bound) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    if(poly_chknorm(&v->vec[i], bound))
-      return 1;
-
-  return 0;
-}
-
-/*************************************************
-* Name:        polyveck_power2round
-*
-* Description: For all coefficients a of polynomials in vector of length K,
-*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
-*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
-*              standard representatives.
-*
-* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
-*                              coefficients a1
-*              - polyveck *v0: pointer to output vector of polynomials with
-*                              coefficients a0
-*              - const polyveck *v: pointer to input vector
-**************************************************/
-void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_decompose
-*
-* Description: For all coefficients a of polynomials in vector of length K,
-*              compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0
-*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
-*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
-*              Assumes coefficients to be standard representatives.
-*
-* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
-*                              coefficients a1
-*              - polyveck *v0: pointer to output vector of polynomials with
-*                              coefficients a0
-*              - const polyveck *v: pointer to input vector
-**************************************************/
-void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
-}
-
-/*************************************************
-* Name:        polyveck_make_hint
-*
-* Description: Compute hint vector.
-*
-* Arguments:   - polyveck *h: pointer to output vector
-*              - const polyveck *v0: pointer to low part of input vector
-*              - const polyveck *v1: pointer to high part of input vector
-*
-* Returns number of 1 bits.
-**************************************************/
-unsigned int polyveck_make_hint(polyveck *h,
-                                const polyveck *v0,
-                                const polyveck *v1)
-{
-  unsigned int i, s = 0;
-
-  for(i = 0; i < K; ++i)
-    s += poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]);
-
-  return s;
-}
-
-/*************************************************
-* Name:        polyveck_use_hint
-*
-* Description: Use hint vector to correct the high bits of input vector.
-*
-* Arguments:   - polyveck *w: pointer to output vector of polynomials with
-*                             corrected high bits
-*              - const polyveck *u: pointer to input vector
-*              - const polyveck *h: pointer to input hint vector
-**************************************************/
-void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
-}
-
-void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    polyw1_pack(&r[i*POLYW1_PACKEDBYTES], &w1->vec[i]);
-}
diff --git a/src/dilithium/polyvec.h b/src/dilithium/polyvec.h
deleted file mode 100644
index e294ba7..0000000
--- a/src/dilithium/polyvec.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-/* Vectors of polynomials of length L */
-typedef struct {
-  poly vec[L];
-} polyvecl;
-
-#define polyvecl_uniform_eta DILITHIUM_NAMESPACE(polyvecl_uniform_eta)
-void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);
-
-#define polyvecl_uniform_gamma1 DILITHIUM_NAMESPACE(polyvecl_uniform_gamma1)
-void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
-
-#define polyvecl_reduce DILITHIUM_NAMESPACE(polyvecl_reduce)
-void polyvecl_reduce(polyvecl *v);
-
-#define polyvecl_freeze DILITHIUM_NAMESPACE(polyvecl_freeze)
-void polyvecl_freeze(polyvecl *v);
-
-#define polyvecl_add DILITHIUM_NAMESPACE(polyvecl_add)
-void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
-
-#define polyvecl_ntt DILITHIUM_NAMESPACE(polyvecl_ntt)
-void polyvecl_ntt(polyvecl *v);
-#define polyvecl_invntt_tomont DILITHIUM_NAMESPACE(polyvecl_invntt_tomont)
-void polyvecl_invntt_tomont(polyvecl *v);
-#define polyvecl_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyvecl_pointwise_poly_montgomery)
-void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
-#define polyvecl_pointwise_acc_montgomery \
-        DILITHIUM_NAMESPACE(polyvecl_pointwise_acc_montgomery)
-void polyvecl_pointwise_acc_montgomery(poly *w,
-                                       const polyvecl *u,
-                                       const polyvecl *v);
-
-
-#define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm)
-int polyvecl_chknorm(const polyvecl *v, int32_t B);
-
-
-
-/* Vectors of polynomials of length K */
-typedef struct {
-  poly vec[K];
-} polyveck;
-
-#define polyveck_uniform_eta DILITHIUM_NAMESPACE(polyveck_uniform_eta)
-void polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);
-
-#define polyveck_reduce DILITHIUM_NAMESPACE(polyveck_reduce)
-void polyveck_reduce(polyveck *v);
-#define polyveck_caddq DILITHIUM_NAMESPACE(polyveck_caddq)
-void polyveck_caddq(polyveck *v);
-#define polyveck_freeze DILITHIUM_NAMESPACE(polyveck_freeze)
-void polyveck_freeze(polyveck *v);
-
-#define polyveck_add DILITHIUM_NAMESPACE(polyveck_add)
-void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
-#define polyveck_sub DILITHIUM_NAMESPACE(polyveck_sub)
-void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
-#define polyveck_shiftl DILITHIUM_NAMESPACE(polyveck_shiftl)
-void polyveck_shiftl(polyveck *v);
-
-#define polyveck_ntt DILITHIUM_NAMESPACE(polyveck_ntt)
-void polyveck_ntt(polyveck *v);
-#define polyveck_invntt_tomont DILITHIUM_NAMESPACE(polyveck_invntt_tomont)
-void polyveck_invntt_tomont(polyveck *v);
-#define polyveck_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyveck_pointwise_poly_montgomery)
-void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);
-
-#define polyveck_chknorm DILITHIUM_NAMESPACE(polyveck_chknorm)
-int polyveck_chknorm(const polyveck *v, int32_t B);
-
-#define polyveck_power2round DILITHIUM_NAMESPACE(polyveck_power2round)
-void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
-#define polyveck_decompose DILITHIUM_NAMESPACE(polyveck_decompose)
-void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
-#define polyveck_make_hint DILITHIUM_NAMESPACE(polyveck_make_hint)
-unsigned int polyveck_make_hint(polyveck *h,
-                                const polyveck *v0,
-                                const polyveck *v1);
-#define polyveck_use_hint DILITHIUM_NAMESPACE(polyveck_use_hint)
-void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);
-
-#define polyveck_pack_w1 DILITHIUM_NAMESPACE(polyveck_pack_w1)
-void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1);
-
-#define polyvec_matrix_expand DILITHIUM_NAMESPACE(polyvec_matrix_expand)
-void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
-
-#define polyvec_matrix_pointwise_montgomery DILITHIUM_NAMESPACE(polyvec_matrix_pointwise_montgomery)
-void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);
-
-#endif
diff --git a/src/dilithium/reduce.h b/src/dilithium/reduce.h
deleted file mode 100644
index 02df550..0000000
--- a/src/dilithium/reduce.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef REDUCE_H
-#define REDUCE_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define MONT -4186625 // 2^32 % Q
-#define QINV 58728449 // q^(-1) mod 2^32
-
-#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce)
-/*************************************************
-* Name:        montgomery_reduce
-*
-* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
-*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
-*
-* Arguments:   - int64_t: finite field element a
-*
-* Returns r.
-**************************************************/
-static inline int32_t montgomery_reduce(int64_t a) {
-  int32_t t;
-
-  t = (int64_t)(int32_t)a*QINV;
-  t = (a - (int64_t)t*Q) >> 32;
-  return t;
-}
-
-#endif
diff --git a/src/dilithium/rounding.c b/src/dilithium/rounding.c
deleted file mode 100644
index 889f0a2..0000000
--- a/src/dilithium/rounding.c
+++ /dev/null
@@ -1,102 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "rounding.h"
-
-/*************************************************
-* Name:        power2round
-*
-* Description: For finite field element a, compute a0, a1 such that
-*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
-*              Assumes a to be standard representative.
-*
-* Arguments:   - int32_t a: input element
-*              - int32_t *a0: pointer to output element a0
-*
-* Returns a1.
-**************************************************/
-int32_t power2round(int32_t *a0, int32_t a)  {
-  int32_t a1;
-
-  a1 = (a + (1 << (D-1)) - 1) >> D;
-  *a0 = a - (a1 << D);
-  return a1;
-}
-
-/*************************************************
-* Name:        decompose
-*
-* Description: For finite field element a, compute high and low bits a0, a1 such
-*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
-*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
-*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
-*              representative.
-*
-* Arguments:   - int32_t a: input element
-*              - int32_t *a0: pointer to output element a0
-*
-* Returns a1.
-**************************************************/
-int32_t decompose(int32_t *a0, int32_t a) {
-  int32_t a1;
-
-  a1  = (a + 127) >> 7;
-#if GAMMA2 == (Q-1)/32
-  a1  = (a1*1025 + (1 << 21)) >> 22;
-  a1 &= 15;
-#elif GAMMA2 == (Q-1)/88
-  a1  = (a1*11275 + (1 << 23)) >> 24;
-  a1 ^= ((43 - a1) >> 31) & a1;
-#endif
-
-  *a0  = a - a1*2*GAMMA2;
-  *a0 -= (((Q-1)/2 - *a0) >> 31) & Q;
-  return a1;
-}
-
-/*************************************************
-* Name:        make_hint
-*
-* Description: Compute hint bit indicating whether the low bits of the
-*              input element overflow into the high bits.
-*
-* Arguments:   - int32_t a0: low bits of input element
-*              - int32_t a1: high bits of input element
-*
-* Returns 1 if overflow.
-**************************************************/
-unsigned int make_hint(int32_t a0, int32_t a1) {
-  if(a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0))
-    return 1;
-
-  return 0;
-}
-
-/*************************************************
-* Name:        use_hint
-*
-* Description: Correct high bits according to hint.
-*
-* Arguments:   - int32_t a: input element
-*              - unsigned int hint: hint bit
-*
-* Returns corrected high bits.
-**************************************************/
-int32_t use_hint(int32_t a, unsigned int hint) {
-  int32_t a0, a1;
-
-  a1 = decompose(&a0, a);
-  if(hint == 0)
-    return a1;
-
-#if GAMMA2 == (Q-1)/32
-  if(a0 > 0)
-    return (a1 + 1) & 15;
-  else
-    return (a1 - 1) & 15;
-#elif GAMMA2 == (Q-1)/88
-  if(a0 > 0)
-    return (a1 == 43) ?  0 : a1 + 1;
-  else
-    return (a1 ==  0) ? 43 : a1 - 1;
-#endif
-}
diff --git a/src/dilithium/rounding.h b/src/dilithium/rounding.h
deleted file mode 100644
index b72e8e8..0000000
--- a/src/dilithium/rounding.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef ROUNDING_H
-#define ROUNDING_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define power2round DILITHIUM_NAMESPACE(power2round)
-int32_t power2round(int32_t *a0, int32_t a);
-
-#define decompose DILITHIUM_NAMESPACE(decompose)
-int32_t decompose(int32_t *a0, int32_t a);
-
-#define make_hint DILITHIUM_NAMESPACE(make_hint)
-unsigned int make_hint(int32_t a0, int32_t a1);
-
-#define use_hint DILITHIUM_NAMESPACE(use_hint)
-int32_t use_hint(int32_t a, unsigned int hint);
-
-#endif
diff --git a/src/dilithium/sign.c b/src/dilithium/sign.c
deleted file mode 100644
index 7482654..0000000
--- a/src/dilithium/sign.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include "../rng.h"
-
-#include <stdint.h>
-#include "params.h"
-#include "sign.h"
-#include "packing.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "symmetric.h"
-
-/*************************************************
-* Name:        crypto_sign_keypair
-*
-* Description: Generates public and private key.
-*
-* Arguments:   - uint8_t *pk: pointer to output public key (allocated
-*                             array of CRYPTO_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk: pointer to output private key (allocated
-*                             array of CRYPTO_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
-  uint8_t seedbuf[3*SEEDBYTES];
-  uint8_t tr[CRHBYTES];
-  const uint8_t *rho, *rhoprime, *key;
-  polyvecl mat[K];
-  polyvecl s1, s1hat;
-  polyveck s2, t1, t0;
-
-  /* Get randomness for rho, rhoprime and key */
-  randombytes(seedbuf, SEEDBYTES);
-  shake256(seedbuf, 3*SEEDBYTES, seedbuf, SEEDBYTES);
-  rho = seedbuf;
-  rhoprime = seedbuf + SEEDBYTES;
-  key = seedbuf + 2*SEEDBYTES;
-
-  /* Expand matrix */
-  polyvec_matrix_expand(mat, rho);
-
-  /* Sample short vectors s1 and s2 */
-  polyvecl_uniform_eta(&s1, rhoprime, 0);
-  polyveck_uniform_eta(&s2, rhoprime, L);
-
-  /* Matrix-vector multiplication */
-  s1hat = s1;
-  polyvecl_ntt(&s1hat);
-  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
-  polyveck_reduce(&t1);
-  polyveck_invntt_tomont(&t1);
-
-  /* Add error vector s2 */
-  polyveck_add(&t1, &t1, &s2);
-
-  /* Extract t1 and write public key */
-  polyveck_caddq(&t1);
-  polyveck_power2round(&t1, &t0, &t1);
-  pack_pk(pk, rho, &t1);
-
-  /* Compute CRH(rho, t1) and write secret key */
-  crh(tr, pk, CRYPTO_PUBLICKEYBYTES);
-  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
-
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_sign_signature
-*
-* Description: Computes signature.
-*
-* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
-*              - size_t *siglen: pointer to output length of signature
-*              - uint8_t *m:     pointer to message to be signed
-*              - size_t mlen:    length of message
-*              - uint8_t *sk:    pointer to bit-packed secret key
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_sign_signature(uint8_t *sig,
-                          size_t *siglen,
-                          const uint8_t *m,
-                          size_t mlen,
-                          const uint8_t *sk)
-{
-  unsigned int n;
-  uint8_t seedbuf[2*SEEDBYTES + 3*CRHBYTES];
-  uint8_t *rho, *tr, *key, *mu, *rhoprime;
-  uint16_t nonce = 0;
-  polyvecl mat[K], s1, y, z;
-  polyveck t0, s2, w1, w0, h;
-  poly cp;
-  shake256incctx state;
-
-  rho = seedbuf;
-  tr = rho + SEEDBYTES;
-  key = tr + CRHBYTES;
-  mu = key + SEEDBYTES;
-  rhoprime = mu + CRHBYTES;
-  unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);
-
-  /* Compute CRH(tr, msg) */
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, tr, CRHBYTES);
-  shake256_inc_absorb(&state, m, mlen);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(mu, CRHBYTES, &state);
-
-#ifdef DILITHIUM_RANDOMIZED_SIGNING
-  randombytes(rhoprime, CRHBYTES);
-#else
-  crh(rhoprime, key, SEEDBYTES + CRHBYTES);
-#endif
-
-  /* Expand matrix and transform vectors */
-  polyvec_matrix_expand(mat, rho);
-  polyvecl_ntt(&s1);
-  polyveck_ntt(&s2);
-  polyveck_ntt(&t0);
-
-rej:
-  /* Sample intermediate vector y */
-  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
-
-  /* Matrix-vector multiplication */
-  z = y;
-  polyvecl_ntt(&z);
-  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
-  polyveck_reduce(&w1);
-  polyveck_invntt_tomont(&w1);
-
-  /* Decompose w and call the random oracle */
-  polyveck_caddq(&w1);
-  polyveck_decompose(&w1, &w0, &w1);
-  polyveck_pack_w1(sig, &w1);
-
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
-  shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(sig, SEEDBYTES, &state);
-  poly_challenge(&cp, sig);
-  poly_ntt(&cp);
-
-  /* Compute z, reject if it reveals secret */
-  polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
-  polyvecl_invntt_tomont(&z);
-  polyvecl_add(&z, &z, &y);
-  polyvecl_reduce(&z);
-  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
-    goto rej;
-
-  /* Check that subtracting cs2 does not change high bits of w and low bits
-   * do not reveal secret information */
-  polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
-  polyveck_invntt_tomont(&h);
-  polyveck_sub(&w0, &w0, &h);
-  polyveck_reduce(&w0);
-  if(polyveck_chknorm(&w0, GAMMA2 - BETA))
-    goto rej;
-
-  /* Compute hints for w1 */
-  polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
-  polyveck_invntt_tomont(&h);
-  polyveck_reduce(&h);
-  if(polyveck_chknorm(&h, GAMMA2))
-    goto rej;
-
-  polyveck_add(&w0, &w0, &h);
-  n = polyveck_make_hint(&h, &w0, &w1);
-  if(n > OMEGA)
-    goto rej;
-
-  /* Write signature */
-  pack_sig(sig, sig, &z, &h);
-  *siglen = CRYPTO_BYTES;
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_sign
-*
-* Description: Compute signed message.
-*
-* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
-*                             array with CRYPTO_BYTES + mlen bytes),
-*                             can be equal to m
-*              - size_t *smlen: pointer to output length of signed
-*                               message
-*              - const uint8_t *m: pointer to message to be signed
-*              - size_t mlen: length of message
-*              - const uint8_t *sk: pointer to bit-packed secret key
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_sign(uint8_t *sm,
-                size_t *smlen,
-                const uint8_t *m,
-                size_t mlen,
-                const uint8_t *sk)
-{
-  size_t i;
-
-  for(i = 0; i < mlen; ++i)
-    sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
-  crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk);
-  *smlen += mlen;
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_sign_verify
-*
-* Description: Verifies signature.
-*
-* Arguments:   - uint8_t *m: pointer to input signature
-*              - size_t siglen: length of signature
-*              - const uint8_t *m: pointer to message
-*              - size_t mlen: length of message
-*              - const uint8_t *pk: pointer to bit-packed public key
-*
-* Returns 0 if signature could be verified correctly and -1 otherwise
-**************************************************/
-int crypto_sign_verify(const uint8_t *sig,
-                       size_t siglen,
-                       const uint8_t *m,
-                       size_t mlen,
-                       const uint8_t *pk)
-{
-  unsigned int i;
-  uint8_t buf[K*POLYW1_PACKEDBYTES];
-  uint8_t rho[SEEDBYTES];
-  uint8_t mu[CRHBYTES];
-  uint8_t c[SEEDBYTES];
-  uint8_t c2[SEEDBYTES];
-  poly cp;
-  polyvecl mat[K], z;
-  polyveck t1, w1, h;
-  shake256incctx state;
-
-  if(siglen != CRYPTO_BYTES)
-    return -1;
-
-  unpack_pk(rho, &t1, pk);
-  if(unpack_sig(c, &z, &h, sig))
-    return -1;
-  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
-    return -1;
-
-  /* Compute CRH(CRH(rho, t1), msg) */
-  crh(mu, pk, CRYPTO_PUBLICKEYBYTES);
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
-  shake256_inc_absorb(&state, m, mlen);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(mu, CRHBYTES, &state);
-
-  /* Matrix-vector multiplication; compute Az - c2^dt1 */
-  poly_challenge(&cp, c);
-  polyvec_matrix_expand(mat, rho);
-
-  polyvecl_ntt(&z);
-  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
-
-  poly_ntt(&cp);
-  polyveck_shiftl(&t1);
-  polyveck_ntt(&t1);
-  polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);
-
-  polyveck_sub(&w1, &w1, &t1);
-  polyveck_reduce(&w1);
-  polyveck_invntt_tomont(&w1);
-
-  /* Reconstruct w1 */
-  polyveck_caddq(&w1);
-  polyveck_use_hint(&w1, &w1, &h);
-  polyveck_pack_w1(buf, &w1);
-
-  /* Call random oracle and verify challenge */
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
-  shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(c2, SEEDBYTES, &state);
-  for(i = 0; i < SEEDBYTES; ++i)
-    if(c[i] != c2[i])
-      return -1;
-
-  return 0;
-}
-
-/*************************************************
-* Name:        crypto_sign_open
-*
-* Description: Verify signed message.
-*
-* Arguments:   - uint8_t *m: pointer to output message (allocated
-*                            array with smlen bytes), can be equal to sm
-*              - size_t *mlen: pointer to output length of message
-*              - const uint8_t *sm: pointer to signed message
-*              - size_t smlen: length of signed message
-*              - const uint8_t *pk: pointer to bit-packed public key
-*
-* Returns 0 if signed message could be verified correctly and -1 otherwise
-**************************************************/
-int crypto_sign_open(uint8_t *m,
-                     size_t *mlen,
-                     const uint8_t *sm,
-                     size_t smlen,
-                     const uint8_t *pk)
-{
-  size_t i;
-
-  if(smlen < CRYPTO_BYTES)
-    goto badsig;
-
-  *mlen = smlen - CRYPTO_BYTES;
-  if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk))
-    goto badsig;
-  else {
-    /* All good, copy msg, return 0 */
-    for(i = 0; i < *mlen; ++i)
-      m[i] = sm[CRYPTO_BYTES + i];
-    return 0;
-  }
-
-badsig:
-  /* Signature verification failed */
-  *mlen = -1;
-  for(i = 0; i < smlen; ++i)
-    m[i] = 0;
-
-  return -1;
-}
diff --git a/src/dilithium/sign.h b/src/dilithium/sign.h
deleted file mode 100644
index 42240b3..0000000
--- a/src/dilithium/sign.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef SIGN_H
-#define SIGN_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-#include "api.h"
-#include "polyvec.h"
-#include "poly.h"
-
-#define challenge DILITHIUM_NAMESPACE(challenge)
-void challenge(poly *c, const uint8_t seed[SEEDBYTES]);
-
-// #define crypto_sign_keypair DILITHIUM_NAMESPACE(crypto_sign_keypair)
-// int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
-
-// #define crypto_sign_signature DILITHIUM_NAMESPACE(signature)
-// int crypto_sign_signature(uint8_t *sig, size_t *siglen,
-//                           const uint8_t *m, size_t mlen,
-//                           const uint8_t *sk);
-
-// #define crypto_sign DILITHIUM_NAMESPACE(crypto_sign)
-// int crypto_sign(uint8_t *sm, size_t *smlen,
-//                 const uint8_t *m, size_t mlen,
-//                 const uint8_t *sk);
-
-// #define crypto_sign_verify DILITHIUM_NAMESPACE(verify)
-// int crypto_sign_verify(const uint8_t *sig, size_t siglen,
-//                        const uint8_t *m, size_t mlen,
-//                        const uint8_t *pk);
-
-// #define crypto_sign_open DILITHIUM_NAMESPACE(crypto_sign_open)
-// int crypto_sign_open(uint8_t *m, size_t *mlen,
-//                      const uint8_t *sm, size_t smlen,
-//                      const uint8_t *pk);
-
-#endif
diff --git a/src/dilithium/smallntt.S b/src/dilithium/smallntt.S
deleted file mode 100644
index 747c111..0000000
--- a/src/dilithium/smallntt.S
+++ /dev/null
@@ -1,837 +0,0 @@
-#include "macros.i"
-
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-// general macros
-.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  ldr.w \a0, [\a, \mem0]
-  ldr.w \a1, [\a, \mem1]
-  ldr.w \a2, [\a, \mem2]
-  ldr.w \a3, [\a, \mem3]
-.endm
-
-.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  str.w \a0, [\a, \mem0]
-  str.w \a1, [\a, \mem1]
-  str.w \a2, [\a, \mem2]
-  str.w \a3, [\a, \mem3]
-.endm
-
-.macro montgomery q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \tmp, \q, \tmp, \a
-.endm
-
-.macro montgomery_inplace q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \a, \q, \tmp, \a
-.endm
-
-.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst
-  smulbb \tmp2, \a, \montconst
-  montgomery \q, \qinv, \tmp2, \tmp
-  smultb \a, \a, \montconst
-  montgomery \q, \qinv, \a, \tmp2
-  pkhtb \a, \tmp2, \tmp, asr#16
-.endm
-
-// #######
-// #######
-// # NTT #
-// #######
-// #######
-
-.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv
-    smulb\tb \tmp, \a, \twiddle
-    smult\tb \a, \a, \twiddle
-    montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-    montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp2
-    pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
-.endm
-
-.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
-  smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb
-  smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb
-  montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-  montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp
-  pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
-  usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs)
-  uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs)
-.endm
-
-.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv
-  doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv
-  doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w \twiddle, [\twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2
-    // layer 3
-    vmov \twiddle, \xi01
-    two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    vmov \twiddle, \xi23
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    vmov \twiddle, \xi45
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    vmov \twiddle, \xi67
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.global small_ntt_asm
-.type small_ntt_asm, %function
-.align 2
-small_ntt_asm:
-  push {r4-r11, r14}
-  vpush.w {s16}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #769
-  movt qinv, #767
-
-  ### LAYER 7+6+5+4
-  .equ distance, 256
-  .equ offset, 32
-  .equ strincr, 4
-  // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s8-s15}
-
-
-  add tmp, poly, #strincr*8
-  vmov s16, tmp
-  1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // 8-NTT on a1, a3, ..., a15
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s13
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s14
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s15
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // 8-NTT on a0, a2, ..., a14
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1 // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3 // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5 // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7 // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1 // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3 // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5 // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7 // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s16
-    cmp.w poly, tmp
-  bne.w 1b
-
-  sub.w poly, #8*strincr
-
-  ### LAYER 3+2+1
-
-  .equ distance, distance/16
-  .equ strincr, 32
-
-  add.w tmp, poly, #strincr*16
-  vmov s13, tmp
-
-  2:
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-  bne.w 2b
-
-  vpop.w {s16}
-  pop {r4-r11, pc}
-
-
-.unreq poly
-.unreq twiddle_ptr
-.unreq poly0
-.unreq poly1
-.unreq poly2
-.unreq poly3
-.unreq poly4
-.unreq poly5
-.unreq poly6
-.unreq poly7
-.unreq twiddle
-.unreq qinv
-.unreq q
-.unreq tmp
-.unreq tmp2
-
-// ########
-// ########
-// # INTT #
-// ########
-// ########
-
-.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv
-  uadd16 \tmp, \a0, \a1
-  usub16 \a1, \a0, \a1
-  mov.w \a0, \tmp
-.endm
-
-.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv
-  doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv
-  doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-  // layer 1
-  sadd16.w \tmp, \c0, \c1 // c0, c1
-  ssub16.w \c1, \c0, \c1
-  sadd16.w \tmp2, \c2, \c3 // c2, c3
-  ssub16.w \c3, \c2, \c3
-
-  sadd16.w \c0, \c4, \c5 // c4, c5
-  ssub16.w \c5, \c4, \c5
-  sadd16.w \c2, \c6, \c7 // c6, c7
-  ssub16.w \c7, \c6, \c7
-  // c4, c6 are free at this point
-
-  // layer 2
-  sadd16.w \c6, \tmp, \tmp2 // c0, c2
-  ssub16.w \tmp2, \tmp, \tmp2
-  sadd16.w \c4, \c0, \c2 // c4, c6
-  ssub16.w \c2, \c0, \c2
-
-  vmov.w \twiddle, \xi12
-  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-  // c0, c6 are free at this point
-
-  // layer 3
-  sadd16.w \c0, \c6, \c4 // c0, c4
-  ssub16.w \c4, \c6, \c4
-
-  vmov.w \twiddle, \xi34
-  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-  vmov.w \twiddle, \xi56
-  // this block is one doublebutterfly
-  smulbb \tmp, \c2, \twiddle // c2, c6
-  smultb \c2, \c2, \twiddle
-  montgomery_inplace \q, \qinv, \tmp, \c6
-  montgomery_inplace \q, \qinv, \c2, \c6
-  pkhtb \tmp, \c2, \tmp, asr #16
-  ssub16.w \c6, \tmp2, \tmp
-  sadd16.w \c2, \tmp2, \tmp
-
-  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
-.endm
-
-.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-  // layer 1
-  sadd16.w \tmp, \c0, \c1 // c0, c1
-  ssub16.w \c1, \c0, \c1
-  sadd16.w \tmp2, \c2, \c3 // c2, c3
-  ssub16.w \c3, \c2, \c3
-
-  sadd16.w \c0, \c4, \c5 // c4, c5
-  ssub16.w \c5, \c4, \c5
-  sadd16.w \c2, \c6, \c7 // c6, c7
-  ssub16.w \c7, \c6, \c7
-  // c4, c6 are free at this point
-
-  mov.w \c6, \tmp
-  mov.w \c4, \c0
-
-  // layer 2
-  vmov.w \twiddle, \xi12
-  doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv
-  doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv
-  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-  // c0, c6 are free at this point
-
-  // layer 3
-  sadd16.w \c0, \c6, \c4 // c0, c4
-  ssub16.w \c4, \c6, \c4
-
-  vmov.w \twiddle, \xi34
-  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-  vmov.w \twiddle, \xi56
-  // this block is one doublebutterfly
-  smulbb \tmp, \c2, \twiddle // c2, c6
-  smultb \c2, \c2, \twiddle
-  montgomery_inplace \q, \qinv, \tmp, \c6
-  montgomery_inplace \q, \qinv, \c2, \c6
-  pkhtb \tmp, \c2, \tmp, asr #16
-  ssub16.w \c6, \tmp2, \tmp
-  sadd16.w \c2, \tmp2, \tmp
-
-  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
-.endm
-
-.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w twiddle, [twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2
-    smulb\tb \tmp, \a, \twiddle
-    smmulr.w \tmp2, \tmp, \Qbar
-    mls.w \tmp, \tmp2, \Q, \tmp
-    smult\tb \a, \a, \twiddle
-    smmulr.w \tmp2, \a, \Qbar
-    mls.w \a, \tmp2, \Q, \a
-    pkhbt \a, \tmp, \a, lsl #16
-.endm
-
-.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2
-
-    movt \Q, #0
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    movt \Q, #767
-
-.endm
-
-.global small_invntt_tomont_asm
-.type small_invntt_tomont_asm, %function
-.align 2
-small_invntt_tomont_asm:
-  push {r4-r11, r14}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #769
-  movt qinv, #767
-
-  ### LAYER 7+6+5+4
-  .equ distance, 16
-  .equ offset, 32
-  .equ strincr, 64
-
-  // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s8-s15}
-
-  add.w tmp, poly, #8*strincr
-  vmov s8, tmp
-  1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // NTT on a1, a3, ..., a15
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s13
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s14
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s15
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // NTT on a0, a2, ..., a14
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1 // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3 // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5 // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7 // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1 // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3 // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5 // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7 // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
-
-    vmov tmp, s8
-    cmp.w poly, tmp
-  bne.w 1b
-
-  sub.w poly, #8*strincr
-
-  ### LAYER 3+2+1
-  .equ distance, distance*16
-  .equ strincr, 4
-
-  // ITER 0
-  load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-  load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-  vldm twiddle_ptr!, {s5-s7}
-
-  _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2
-
-  vmov.w s2, poly
-  movw poly, #:lower16:5585133
-  movt poly, #:upper16:5585133
-
-  // twisting
-  _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-  vmov.w poly, s2
-
-  store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-  str.w poly1, [poly, #distance/4]
-  str.w poly2, [poly, #2*distance/4]
-  str.w poly3, [poly, #3*distance/4]
-  str.w poly0, [poly], #4
-
-  // ITER 1-12
-  add.w tmp, poly, #strincr*3*(3+1)
-  vmov s14, tmp
-  3:
-    add.w tmp, poly, #strincr*3
-    vmov s13, tmp
-    2:
-      // polys upto 6q
-      load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-      load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-
-      _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-      vmov.w s2, poly
-      movw poly, #:lower16:5585133
-      movt poly, #:upper16:5585133
-
-      // twisting
-      _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-      vmov.w poly, s2
-
-      store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-      str.w poly1, [poly, #distance/4]
-      str.w poly2, [poly, #2*distance/4]
-      str.w poly3, [poly, #3*distance/4]
-      str.w poly0, [poly], #4
-
-      vmov tmp, s13
-      cmp.w poly, tmp
-    bne.w 2b
-
-    // polys upto 9q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s14
-    cmp.w poly, tmp
-  bne.w 3b
-
-  // ITER 13-15
-  add tmp, poly, #3*strincr
-  vmov s13, tmp
-  2:
-    // polys upto 6q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-  bne.w 2b
-
-  pop {r4-r11, pc}
-
-.unreq poly
-.unreq twiddle_ptr
-.unreq poly0
-.unreq poly1
-.unreq poly2
-.unreq poly3
-.unreq poly4
-.unreq poly5
-.unreq poly6
-.unreq poly7
-.unreq twiddle
-.unreq qinv
-.unreq q
-.unreq tmp
-.unreq tmp2
-
-.align 2
-.global small_pointmul_asm
-.type small_pointmul_asm, %function
-small_pointmul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-
-    .equ width, 4
-
-    add.w r12, r2, #64*2
-    _point_mul_16_loop:
-
-    ldr.w r7, [r1, #2*width]
-    ldr.w r8, [r1, #3*width]
-    ldrsh.w r9, [r2, #1*2]
-    ldr.w r5, [r1, #1*width]
-    ldr.w r4, [r1], #4*width
-    ldrsh.w r6, [r2], #2*2
-
-    smultb r10, r4, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r4, r4, r11
-
-
-    neg.w r6, r6
-
-    smultb r10, r5, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r5, r5, r11
-
-    str.w r5, [r0, #1*width]
-    str.w r4, [r0], #2*width
-
-    smultb r10, r7, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r7, r7, r11
-
-    neg.w r9, r9
-
-    smultb r10, r8, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r8, r8, r11
-
-    str.w r8, [r0, #1*width]
-    str.w r7, [r0], #2*width
-
-    cmp.w r2, r12
-    bne.w _point_mul_16_loop
-
-    pop.w {r4-r11, pc}
-
-  .align 2
-.global small_asymmetric_mul_asm
-.type small_asymmetric_mul_asm, %function
-small_asymmetric_mul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-    .equ width, 4
-    add.w r12, r0, #256*2
-    _asymmetric_mul_16_loop:
-    ldr.w r7, [r1, #width]
-    ldr.w r4, [r1], #2*width
-    ldr.w r8, [r2, #width]
-    ldr.w r5, [r2], #2*width
-    ldr.w r9, [r3, #width]
-    ldr.w r6, [r3], #2*width
-
-    smuad r10, r4, r6
-    montgomery r14, r14, r10, r6
-    smuadx r11, r4, r5
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-
-    str.w r10, [r0], #width
-
-    smuad r10, r7, r9
-    montgomery r14, r14, r10, r6
-    smuadx r11, r7, r8
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-    str.w r10, [r0], #width
-
-
-    cmp.w r0, r12
-    bne.w _asymmetric_mul_16_loop
-
-    pop.w {r4-r11, pc}
\ No newline at end of file
diff --git a/src/dilithium/smallntt.h b/src/dilithium/smallntt.h
deleted file mode 100644
index 0aa0ce9..0000000
--- a/src/dilithium/smallntt.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef SMALLNTT_H
-#define SMALLNTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-static const int16_t zetas[64] = {
--23, 112, -151, -134, -52, -148, 227, 232,
--71, 212, 236, 21, 341, 379, -202, -220,
-352, 292, 238, 145, 194, -276, 70, -274,
-117, 333, 66, 247, -237, -83, -252, -244,
-331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168
-};
-
-static const int16_t zetas_asm[128] = {
-0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337,
--76, 147, -114, -23, 112, -151, -134,
--98, -272, 54, -52, -148, 227, 232,
-36, -2, -124, -71, 212, 236, 21,
--75, -80, -346, 341, 379, -202, -220,
--339, 86, -51, 352, 292, 238, 145,
--255, 364, 267, 194, -276, 70, -274,
-282, 161, -15, 117, 333, 66, 247,
--203, 288, 169, -237, -83, -252, -244,
--34, 191, 307, 331, -241, 167, 357,
-199, -50, -24, -355, 291, -358, 105,
-178, -170, 226, -115, -209, 14, 99,
-270, 121, -188, -260, 29, 366, -378,
--10, -380, 279, -318, 278, 353, 354,
-149, 180, -375, -184, 127, 330, -303,
-369, -157, 263, 222, -78, -348, -44,
--192, -128, -246, 201, 158, 350, 168
-};
-
-static const int16_t zetas_inv_CT_asm[256] = {
-0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186,
-171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, -306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138,
-};
-
-
-#define SMALL_Q 769
-
-void small_ntt_asm(int16_t a[N], const int16_t * zetas);
-void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas);
-void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas);
-void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]);
-
-#define small_ntt(a) small_ntt_asm(a, zetas_asm)
-#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm)
-#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas)
-#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime);
-
-#endif
diff --git a/src/dilithium/smallpoly.c b/src/dilithium/smallpoly.c
deleted file mode 100644
index 79bd5ca..0000000
--- a/src/dilithium/smallpoly.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include "smallpoly.h"
-#include "smallntt.h"
-
-void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) {
-  for (int i = 0; i < N; i++)
-  {
-    out->coeffs[i] = in->coeffs[i];
-  }
-  small_ntt((int16_t*)out->coeffs);
-  small_point_mul(out2->coeffs, (const int16_t*)out->coeffs);
-}
-
-
-void polyvecl_small_ntt(smallpoly v[L]) {
-  unsigned int i;
-
-  for(i = 0; i < L; ++i)
-    small_ntt((int16_t*)v[i].coeffs);
-}
-
-
-void polyveck_small_ntt(smallpoly v[K]) {
-  unsigned int i;
-
-  for(i = 0; i < K; ++i)
-    small_ntt((int16_t*)v[i].coeffs);
-}
-
-
-
-void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){
-    // re-use the buffer
-    smallpoly *tmp = (smallpoly *)r;
-    small_asymmetric_mul((int16_t*)tmp->coeffs, (const int16_t*)b->coeffs, (const int16_t*)a->coeffs, aprime->coeffs);
-    small_invntt_tomont((int16_t*)tmp->coeffs);
-
-    #ifdef SMALL_POLY_16_BIT
-    int j;
-    // buffer is the same, so we neeed to be careful
-    for(j=N-1;j>=0;j--){
-        r->coeffs[j] = tmp->coeffs[j];
-    }
-    #endif
-}
-
-void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){
-    unsigned int i;
-    for(i=0;i<L;i++){
-        poly_small_basemul_invntt(&r->vec[i], a, aprime, &b[i]);
-    }
-}
-
-void small_polyeta_unpack(smallpoly *r, const uint8_t *a) {
-  unsigned int i;
-
-#if ETA == 2
-  for(i = 0; i < N/8; ++i) {
-    r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
-    r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
-    r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7;
-    r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
-    r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
-    r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7;
-    r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
-    r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
-
-    r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0];
-    r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1];
-    r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2];
-    r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3];
-    r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4];
-    r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5];
-    r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6];
-    r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7];
-  }
-#elif ETA == 4
-  for(i = 0; i < N/2; ++i) {
-    r->coeffs[2*i+0] = a[i] & 0x0F;
-    r->coeffs[2*i+1] = a[i] >> 4;
-    r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0];
-    r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1];
-  }
-#endif
-}
diff --git a/src/dilithium/smallpoly.h b/src/dilithium/smallpoly.h
deleted file mode 100644
index caa2626..0000000
--- a/src/dilithium/smallpoly.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef SMALLPOLY_H
-#define SMALLPOLY_H
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-
-
-
-#if DILITHIUM_MODE == 3 // use q=769
-#define SMALL_POLY_16_BIT
-typedef struct {
-    int16_t coeffs[N];
-} smallpoly;
-
-typedef smallpoly smallhalfpoly;
-
-#else // use q=257
-#define SMALL_POLY_32_BIT
-typedef struct {
-    int32_t coeffs[N];
-} smallpoly;
-
-typedef struct {
-    int16_t coeffs[N];
-} smallhalfpoly;
-#endif
-
-
-void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in);
-void polyvecl_small_ntt(smallpoly v[L]);
-void polyveck_small_ntt(smallpoly v[K]);
-
-
-void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]);
-void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b);
-
-void small_polyeta_unpack(smallpoly *r, const uint8_t *a);
-
-#endif
\ No newline at end of file
diff --git a/src/dilithium/symmetric-shake.c b/src/dilithium/symmetric-shake.c
deleted file mode 100644
index 963f649..0000000
--- a/src/dilithium/symmetric-shake.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "symmetric.h"
-#include "fips202.h"
-
-void dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce)
-{
-  uint8_t t[2];
-  t[0] = nonce;
-  t[1] = nonce >> 8;
-
-  shake128_inc_init(state);
-  shake128_inc_absorb(state, seed, SEEDBYTES);
-  shake128_inc_absorb(state, t, 2);
-  shake128_inc_finalize(state);
-}
-
-void dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce)
-{
-  uint8_t t[2];
-  t[0] = nonce;
-  t[1] = nonce >> 8;
-
-  shake256_inc_init(state);
-  shake256_inc_absorb(state, seed, CRHBYTES);
-  shake256_inc_absorb(state, t, 2);
-  shake256_inc_finalize(state);
-}
diff --git a/src/dilithium/symmetric.h b/src/dilithium/symmetric.h
deleted file mode 100644
index 297c745..0000000
--- a/src/dilithium/symmetric.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include <stdint.h>
-#include "params.h"
-
-#ifdef DILITHIUM_USE_AES
-
-#include "aes256ctr.h"
-#include "fips202.h"
-
-typedef aes256ctr_ctx stream128_state;
-typedef aes256ctr_ctx stream256_state;
-
-#define dilithium_aes256ctr_init DILITHIUM_NAMESPACE(dilithium_aes256ctr_init)
-void dilithium_aes256ctr_init(aes256ctr_ctx *state,
-                              const uint8_t key[32],
-                              uint16_t nonce);
-
-#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES
-#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES
-
-#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
-#define stream128_init(STATE, SEED, NONCE) \
-        dilithium_aes256ctr_init(STATE, SEED, NONCE)
-#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
-        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define stream256_init(STATE, SEED, NONCE) \
-        dilithium_aes256ctr_init(STATE, SEED, NONCE)
-#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
-        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
-
-#else
-
-#include "fips202.h"
-typedef shake128incctx stream128_state;
-typedef shake256incctx stream256_state;
-
-#define shake256_inc_squeezeblocks(OUT, OUTBLOCKS, STATE) \
-        shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE)
-
-#define dilithium_shake128_stream_init DILITHIUM_NAMESPACE(dilithium_shake128_stream_init)
-void dilithium_shake128_stream_init(stream128_state *state,
-                                    const uint8_t seed[SEEDBYTES],
-                                    uint16_t nonce);
-
-#define dilithium_shake256_stream_init DILITHIUM_NAMESPACE(dilithium_shake256_stream_init)
-void dilithium_shake256_stream_init(stream256_state *state,
-                                    const uint8_t seed[CRHBYTES],
-                                    uint16_t nonce);
-
-#define STREAM128_BLOCKBYTES SHAKE128_RATE
-#define STREAM256_BLOCKBYTES SHAKE256_RATE
-
-#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
-#define stream128_init(STATE, SEED, NONCE) \
-        dilithium_shake128_stream_init(STATE, SEED, NONCE)
-#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
-        shake128_inc_squeeze(OUT, OUTBLOCKS*SHAKE128_RATE, STATE)
-#define stream256_init(STATE, SEED, NONCE) \
-        dilithium_shake256_stream_init(STATE, SEED, NONCE)
-#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
-        shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE)
-
-#endif
-
-#endif
diff --git a/src/dilithium/vector.h b/src/dilithium/vector.h
deleted file mode 100644
index 233c6a0..0000000
--- a/src/dilithium/vector.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef VECTOR_H
-#define VECTOR_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define asm_reduce32 DILITHIUM_NAMESPACE(asm_reduce32)
-void asm_reduce32(int32_t a[N]);
-#define asm_caddq DILITHIUM_NAMESPACE(asm_caddq)
-void asm_caddq(int32_t a[N]);
-#define asm_freeze DILITHIUM_NAMESPACE(asm_freeze)
-void asm_freeze(int32_t a[N]);
-#define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform)
-unsigned int asm_rej_uniform(int32_t *a,
-                         unsigned int len,
-                         const unsigned char *buf,
-                         unsigned int buflen);
-#endif
diff --git a/src/dilithium/vector.s b/src/dilithium/vector.s
deleted file mode 100644
index d3eb720..0000000
--- a/src/dilithium/vector.s
+++ /dev/null
@@ -1,191 +0,0 @@
-.syntax unified
-.thumb
-.macro redq a, tmp, q
-    add     \tmp, \a,  #4194304
-    asrs    \tmp, \tmp, #23
-    mls     \a, \tmp, \q, \a
-.endm
-
-// void asm_reduce32(int32_t a[N]);
-.global pqcrystals_dilithium_asm_reduce32
-.type pqcrystals_dilithium_asm_reduce32, %function
-.align 2
-pqcrystals_dilithium_asm_reduce32:
-    push {r4-r10}
-
-    movw r12,#:lower16:8380417
-    movt r12,#:upper16:8380417
-    movw r10, #32
-    1:
-        ldr.w r1, [r0]
-        ldr.w r2, [r0, #1*4]
-        ldr.w r3, [r0, #2*4]
-        ldr.w r4, [r0, #3*4]
-        ldr.w r5, [r0, #4*4]
-        ldr.w r6, [r0, #5*4]
-        ldr.w r7, [r0, #6*4]
-        ldr.w r8, [r0, #7*4]
-
-        redq r1, r9, r12
-        redq r2, r9, r12
-        redq r3, r9, r12
-        redq r4, r9, r12
-        redq r5, r9, r12
-        redq r6, r9, r12
-        redq r7, r9, r12
-        redq r8, r9, r12
-
-        str.w r2, [r0, #1*4]
-        str.w r3, [r0, #2*4]
-        str.w r4, [r0, #3*4]
-        str.w r5, [r0, #4*4]
-        str.w r6, [r0, #5*4]
-        str.w r7, [r0, #6*4]
-        str.w r8, [r0, #7*4]
-        str r1, [r0], #8*4
-        subs r10, #1
-        bne.w 1b
-
-    pop {r4-r10}
-    bx lr
-.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
-
-.macro caddq a, tmp, q
-    and     \tmp, \q, \a, asr #31
-    add     \a, \a, \tmp
-.endm
-
-.macro freezeq a, tmp, q
-    redq \a, \tmp, \q
-    caddq \a, \tmp, \q
-.endm
-
-// // void asm_freeze(int32_t a[N]);
-// .global pqcrystals_dilithium_asm_freeze
-// .type pqcrystals_dilithium_asm_freeze, %function
-// .align 2
-// pqcrystals_dilithium_asm_freeze:
-//     push {r4-r10}
-
-//     movw r12,#:lower16:8380417
-//     movt r12,#:upper16:8380417
-
-//     movw r10, #32
-//     1:
-//         ldr.w r1, [r0]
-//         ldr.w r2, [r0, #1*4]
-//         ldr.w r3, [r0, #2*4]
-//         ldr.w r4, [r0, #3*4]
-//         ldr.w r5, [r0, #4*4]
-//         ldr.w r6, [r0, #5*4]
-//         ldr.w r7, [r0, #6*4]
-//         ldr.w r8, [r0, #7*4]
-
-//         freezeq r1, r9, r12
-//         freezeq r2, r9, r12
-//         freezeq r3, r9, r12
-//         freezeq r4, r9, r12
-//         freezeq r5, r9, r12
-//         freezeq r6, r9, r12
-//         freezeq r7, r9, r12
-//         freezeq r8, r9, r12
-
-//         str.w r2, [r0, #1*4]
-//         str.w r3, [r0, #2*4]
-//         str.w r4, [r0, #3*4]
-//         str.w r5, [r0, #4*4]
-//         str.w r6, [r0, #5*4]
-//         str.w r7, [r0, #6*4]
-//         str.w r8, [r0, #7*4]
-//         str r1, [r0], #8*4
-//         subs r10, #1
-//         bne.w 1b
-
-//     pop {r4-r10}
-//     bx lr
-// .size pqcrystals_dilithium_asm_freeze, .-pqcrystals_dilithium_asm_freeze
-
-// void asm_caddq(int32_t a[N]);
-.global pqcrystals_dilithium_asm_caddq
-.type pqcrystals_dilithium_asm_caddq, %function
-.align 2
-pqcrystals_dilithium_asm_caddq:
-    push {r4-r10}
-
-    movw r12,#:lower16:8380417
-    movt r12,#:upper16:8380417
-
-    movw r10, #32
-    1:
-        ldr.w r1, [r0]
-        ldr.w r2, [r0, #1*4]
-        ldr.w r3, [r0, #2*4]
-        ldr.w r4, [r0, #3*4]
-        ldr.w r5, [r0, #4*4]
-        ldr.w r6, [r0, #5*4]
-        ldr.w r7, [r0, #6*4]
-        ldr.w r8, [r0, #7*4]
-
-        caddq r1, r9, r12
-        caddq r2, r9, r12
-        caddq r3, r9, r12
-        caddq r4, r9, r12
-        caddq r5, r9, r12
-        caddq r6, r9, r12
-        caddq r7, r9, r12
-        caddq r8, r9, r12
-
-        str.w r2, [r0, #1*4]
-        str.w r3, [r0, #2*4]
-        str.w r4, [r0, #3*4]
-        str.w r5, [r0, #4*4]
-        str.w r6, [r0, #5*4]
-        str.w r7, [r0, #6*4]
-        str.w r8, [r0, #7*4]
-        str r1, [r0], #8*4
-        subs r10, #1
-        bne.w 1b
-
-    pop {r4-r10}
-    bx lr
-.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
-
-
-// asm_rej_uniform(int32_t *a,unsigned int len,const unsigned char *buf, unsigned int buflen);
-.global pqcrystals_dilithium_asm_rej_uniform
-.type pqcrystals_dilithium_asm_rej_uniform, %function
-.align 2
-pqcrystals_dilithium_asm_rej_uniform:
-    push.w {r4-r6}
-    push.w {r1}
-    // Store Q-1 in r12.
-    movw r12,#:lower16:8380416
-    movt r12,#:upper16:8380416
-
-    add.w r6, r0, r1, lsl #2
-    add.w r3, r2, r3
-    sub.w r3, r3, #2
-
-1:
-    // If there are less than 3 bytes available, return.
-    cmp.w r3, r2
-    ble.w end
-
-    ldr r5, [r2], #3
-    ubfx r5, r5, #0, #23
-
-    cmp.n r5, r12
-    it le
-    strle r5, [r0], #4
-
-    cmp.n r0, r6
-    bne.n 1b
-
-end:
-    pop.w {r5}
-
-    sub.w r0, r6, r0
-    sub.w r0, r5, r0, lsr #2
-    pop.w {r4-r6}
-    bx lr
-.size pqcrystals_dilithium_asm_rej_uniform, .-pqcrystals_dilithium_asm_rej_uniform
diff --git a/src/dilithium/wrapper.c b/src/dilithium/wrapper.c
deleted file mode 100755
index 942ae78..0000000
--- a/src/dilithium/wrapper.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "./wrapper.h"
-#include "./params.h"
-#include "./api.h"
-#include "./sign.h"
-#include "./poly.h"
-#include <string.h>
-
-#if DILITHIUM_PUBLIC_KEY_SIZE != CRYPTO_PUBLICKEYBYTES
-#error invalid public key size, update me!
-#endif
-#if DILITHIUM_PRIVATE_KEY_SIZE != CRYPTO_SECRETKEYBYTES
-#error invalid private key size, update me!
-#endif
-#if DILITHIUM_SIGNATURE_SIZE != CRYPTO_BYTES
-#error invalid signature size, update me!
-#endif
-#if DILITHIUM_N != N
-#error invalid N, update me!
-#endif
-
-int getDilithiumAlgorithmVariant() {
-	return DILITHIUM_MODE;
-}
-
-uint8_t* DilithiumState_getPrivateKey(DilithiumState* self) {
-	return self->m_sk;
-}
-
-uint8_t* DilithiumState_getPublicKey(DilithiumState* self) {
-	return self->m_pk;
-}
-
-uint8_t* DilithiumState_getScratchPad(DilithiumState* self) {
-	return self->m_scratchpad;
-}
-
-int DilithiumState_verify(const DilithiumState* self, uint8_t *signedMessage) {
-	size_t messageLength = DILITHIUM_MESSAGE_SIZE;
-	return crypto_sign_open(signedMessage, &messageLength, signedMessage, DILITHIUM_SIGNED_MESSAGE_SIZE, self->m_pk);
-}
-
-int DilithiumState_sign(const DilithiumState* self, uint8_t* signature, const uint8_t* message) {
-	size_t signatureSize = DILITHIUM_SIGNATURE_SIZE;
-	return crypto_sign_signature(signature, &signatureSize, message, DILITHIUM_MESSAGE_SIZE, self->m_sk);
-}
-
-int Dilithium_ntt(uint32_t* coefficients) {
-	poly* coeffs = (poly*)coefficients;
-	poly_ntt(coeffs);
-	return 0;
-}
diff --git a/src/dilithium/wrapper.h b/src/dilithium/wrapper.h
deleted file mode 100755
index 5952b0e..0000000
--- a/src/dilithium/wrapper.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef _DILITHIUM_WRAPPER_H_
-#define _DILITHIUM_WRAPPER_H_
-
-#include <stdint.h>
-#include <stddef.h>
-
-#define DILITHIUM_PUBLIC_KEY_SIZE 1952
-#define DILITHIUM_PRIVATE_KEY_SIZE 4016
-#define DILITHIUM_SIGNATURE_SIZE 3293
-#define DILITHIUM_MESSAGE_SIZE 16
-#define DILITHIUM_N 256
-#define DILITHIUM_SIGNED_MESSAGE_SIZE (DILITHIUM_SIGNATURE_SIZE + DILITHIUM_MESSAGE_SIZE)
-
-/**
- * @brief      Get the dilithium algorithm variant. There are a few variants and
- *             only one of them is implemented.
- *
- * @return     The dilithium algorithm variant.
- */
-int getDilithiumAlgorithmVariant();
-
-/**
- * Simple object-oriented wrapper around the various Dilithium functions.
- * All "methods" start with the prefix "DilithiumState_".
- */
-typedef struct DilithiumState_t {
-	uint8_t m_pk[DILITHIUM_PUBLIC_KEY_SIZE];
-	uint8_t m_sk[DILITHIUM_PRIVATE_KEY_SIZE];
-    uint8_t m_scratchpad[DILITHIUM_SIGNED_MESSAGE_SIZE];
-} DilithiumState;
-
-/**
- * @brief      Get the private key bytes.
- *
- * @param[in]  self  The object
- *
- * @return     The private key bytes.
- */
-uint8_t* DilithiumState_getPrivateKey(DilithiumState* self);
-
-/**
- * @brief      Get the public key bytes.
- *
- * @param[in]  self  The object
- *
- * @return     The public key bytes.
- */
-uint8_t* DilithiumState_getPublicKey(DilithiumState* self);
-
-/**
- * @brief      Get the "scratch pad" for message storage, signature storage.
- *
- * @param      self  The object
- *
- * @return     Pointer to the scratch pad.
- */
-uint8_t* DilithiumState_getScratchPad(DilithiumState* self);
-
-/**
- * @brief      Verify a signed message.
- *
- * @param[in]  self           The object
- * @param[in]  signature      Buffer of the signed message. This buffer MUST have
- *                            length DILITHIUM_SIGNATURE_SIZE + DILITHIUM_MESSAGE_SIZE.
- *
- * @return     0 when verification passes, non-zero otherwise.
- */
-int DilithiumState_verify(const DilithiumState* self, uint8_t *signedMessage);
-
-/**
- * @brief      Sign a message.
- *
- * @param[in]  self           The object
- * @param[out] signature      Buffer where the signature will be placed in. This
- *                            buffer MUST have length DILITHIUM_SIGNATURE_SIZE.
- * @param[in]  message        Buffer of the message to be signed. This buffer MUST
- *                            have length DILITHIUM_MESSAGE_SIZE.
- *
- * @return     0 when signing succeeds, non-zero otherwise.
- */
-int DilithiumState_sign(const DilithiumState* self, uint8_t* signature, const uint8_t* message);
-
-///
-/// @brief        Perform a forward NTT.
-///
-/// @param[inout] coefficients  Buffer of polynomial coefficients in integer
-///                             domain. The computation is done in-place, and
-///                             this array contains the coefficients in the
-///                             frequency domain after this function returns.
-///
-int Dilithium_ntt(uint32_t *coefficients);
-
-#endif // _DILITHIUM_WRAPPER_H_
diff --git a/src/kyber512/CMakeLists.txt b/src/kyber512/CMakeLists.txt
deleted file mode 100644
index 284b2b0..0000000
--- a/src/kyber512/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-target_licensed_sources(
-  polyvec.c
-  api.h
-  verify.h
-  reduce.S
-  kem.c
-  indcpa.h
-  fastbasemul.S
-  cbd.h
-  randombytes.h
-  cbd.c
-  indcpa.c
-  poly.c
-  symmetric.h
-  polyvec.h
-  poly.h
-  params.h
-  ntt.c
-  wrapper.h
-  fastinvntt.S
-  ntt.h
-  fastntt.S
-  wrapper.c
-  symmetric-fips202.c
-  fastaddsub.S
-  verify.c
-)
diff --git a/src/kyber512/api.h b/src/kyber512/api.h
deleted file mode 100644
index 3b9244a..0000000
--- a/src/kyber512/api.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef API_H
-#define API_H
-
-#include "params.h"
-
-#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
-#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
-#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
-#define CRYPTO_BYTES           KYBER_SSBYTES
-
-#define CRYPTO_ALGNAME "Kyber512"
-
-int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-
-int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
-
-int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
-
-
-#endif
diff --git a/src/kyber512/cbd.c b/src/kyber512/cbd.c
deleted file mode 100644
index f8911fc..0000000
--- a/src/kyber512/cbd.c
+++ /dev/null
@@ -1,112 +0,0 @@
-#include "cbd.h"
-#include "params.h"
-
-#include <stdint.h>
-
-/*************************************************
-* Name:        load32_littleendian
-*
-* Description: load bytes into a 32-bit integer
-*              in little-endian order
-*
-* Arguments:   - const unsigned char *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x
-**************************************************/
-static uint32_t load32_littleendian(const unsigned char *x) {
-    uint32_t r;
-    r  = (uint32_t)x[0];
-    r |= (uint32_t)x[1] << 8;
-    r |= (uint32_t)x[2] << 16;
-    r |= (uint32_t)x[3] << 24;
-    return r;
-}
-
-/*************************************************
-* Name:        load24_littleendian
-*
-* Description: load 3 bytes into a 32-bit integer
-*              in little-endian order
-*              This function is only needed for Kyber-512
-*
-* Arguments:   - const uint8_t *x: pointer to input byte array
-*
-* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
-**************************************************/
-static uint32_t load24_littleendian(const uint8_t x[3])
-{
-  uint32_t r;
-  r  = (uint32_t)x[0];
-  r |= (uint32_t)x[1] << 8;
-  r |= (uint32_t)x[2] << 16;
-  return r;
-}
-
-
-
-/*************************************************
-* Name:        cbd_eta1
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter KYBER_ETA1
-*              specialized for KYBER_ETA1=3
-*
-* Arguments:   - poly *r:                  pointer to output polynomial
-*              - const unsigned char *buf: pointer to input byte array
-*              - int add:                  boolean to indicate to accumulate into r
-**************************************************/
-void cbd_eta1(poly *r, const unsigned char *buf, int add) {
-  unsigned int i,j;
-  uint32_t t,d;
-  int16_t a,b;
-
-  for(i=0;i<KYBER_N/4;i++) {
-    t  = load24_littleendian(buf+3*i);
-    d  = t & 0x00249249;
-    d += (t>>1) & 0x00249249;
-    d += (t>>2) & 0x00249249;
-
-    for(j=0;j<4;j++) {
-      a = (d >> (6*j+0)) & 0x7;
-      b = (d >> (6*j+3)) & 0x7;
-      if (!add)
-        r->coeffs[4 * i + j] = 0;
-      r->coeffs[4 * i + j] = r->coeffs[4 * i + j] + (a - b);
-    }
-  }
-}
-
-/*************************************************
-* Name:        cbd_eta2
-*
-* Description: Given an array of uniformly random bytes, compute
-*              polynomial with coefficients distributed according to
-*              a centered binomial distribution with parameter KYBER_ETA2
-*              specialized for KYBER_ETA2=2
-*
-* Arguments:   - poly *r:                  pointer to output polynomial
-*              - const unsigned char *buf: pointer to input byte array
-*              - int add:                  boolean to indicate to accumulate into r
-**************************************************/
-void cbd_eta2(poly *r, const unsigned char *buf, int add) {
-    uint32_t d, t;
-    int16_t a, b;
-    int i, j;
-
-    for (i = 0; i < KYBER_N / 8; i++) {
-        t = load32_littleendian(buf + 4 * i);
-        d  = t & 0x55555555;
-        d += (t >> 1) & 0x55555555;
-
-        for (j = 0; j < 8; j++) {
-            a = (d >>  4 * j)    & 0x3;
-            b = (d >> (4 * j + 2)) & 0x3;
-            if (!add)
-              r->coeffs[8 * i + j] = 0;
-            r->coeffs[8 * i + j] = r->coeffs[8 * i + j] + (a - b);
-        }
-    }
-}
-
-
diff --git a/src/kyber512/cbd.h b/src/kyber512/cbd.h
deleted file mode 100644
index 47f1d24..0000000
--- a/src/kyber512/cbd.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef CBD_H
-#define CBD_H
-
-#include "poly.h"
-
-void cbd_eta1(poly *r, const unsigned char *buf, int add);
-void cbd_eta2(poly *r, const unsigned char *buf, int add);
-
-#endif
diff --git a/src/kyber512/fastaddsub.S b/src/kyber512/fastaddsub.S
deleted file mode 100644
index 0d4ae50..0000000
--- a/src/kyber512/fastaddsub.S
+++ /dev/null
@@ -1,60 +0,0 @@
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-.align 2
-.global pointwise_sub
-.type pointwise_sub, %function
-pointwise_sub:
-  push {r4-r11, lr}
-
-  movw r14, #25
-  1:
-    ldm r1!, {r3-r7}
-    ldm r2!, {r8-r12}
-    usub16 r3, r3, r8
-    usub16 r4, r4, r9
-    usub16 r5, r5, r10
-    usub16 r6, r6, r11
-    usub16 r7, r7, r12
-    stm r0!, {r3-r7}
-
-    subs.w r14, #1
-  bne.w 1b
-
-  ldm r1!, {r3-r5}
-  ldm r2!, {r8-r10}
-  usub16 r3, r3, r8
-  usub16 r4, r4, r9
-  usub16 r5, r5, r10
-  stm r0!, {r3-r5}
-  pop {r4-r11, pc}
-
-
-.align 2
-.global pointwise_add
-.type pointwise_add, %function
-pointwise_add:
-  push {r4-r11, lr}
-
-  movw r14, #25
-  1:
-    ldm r1!, {r3-r7}
-    ldm r2!, {r8-r12}
-    uadd16 r3, r3, r8
-    uadd16 r4, r4, r9
-    uadd16 r5, r5, r10
-    uadd16 r6, r6, r11
-    uadd16 r7, r7, r12
-    stm r0!, {r3-r7}
-
-    subs.w r14, #1
-  bne.w 1b
-
-  ldm r1!, {r3-r5}
-  ldm r2!, {r8-r10}
-  uadd16 r3, r3, r8
-  uadd16 r4, r4, r9
-  uadd16 r5, r5, r10
-  stm r0!, {r3-r5}
-  pop {r4-r11, pc}
diff --git a/src/kyber512/fastbasemul.S b/src/kyber512/fastbasemul.S
deleted file mode 100644
index 8d07d60..0000000
--- a/src/kyber512/fastbasemul.S
+++ /dev/null
@@ -1,275 +0,0 @@
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-.macro montgomery q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \tmp, \q, \tmp, \a
-.endm
-
-
-.global doublebasemul_asm
-.type doublebasemul_asm, %function
-.align 2
-doublebasemul_asm:
-  push {r4-r11, lr}
-
-  rptr  .req r0
-  aptr  .req r1
-  bptr  .req r2
-  zeta  .req r3
-  poly0 .req r4
-  poly1 .req r6
-  poly2 .req r5
-  poly3 .req r7
-  q     .req r8
-  qinv  .req r8
-  tmp   .req r9
-  tmp2  .req r10
-  tmp3  .req r11
-
-  movw  q, #3329
-  movt qinv, #3327
-
-  ldrd poly0, poly2, [aptr], #8
-  ldrd poly1, poly3, [bptr], #8
-
-  //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
-  smultt tmp, poly0, poly1
-  montgomery q, qinv, tmp, tmp2
-  smultb tmp2, tmp2, zeta
-  smlabb tmp2, poly0, poly1, tmp2
-  montgomery q, qinv, tmp2, tmp
-  // r[0] in upper half of tmp
-
-  smuadx tmp2, poly0, poly1
-  montgomery q, qinv, tmp2, tmp3
-  // r[1] in upper half of tmp3
-  pkhtb tmp, tmp3, tmp, asr#16
-  str tmp, [rptr], #4
-
-  neg zeta, zeta
-
-  //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
-  smultt tmp, poly2, poly3
-  montgomery q, qinv, tmp, tmp2
-  smultb tmp2, tmp2, zeta
-  smlabb tmp2, poly2, poly3, tmp2
-  montgomery q, qinv, tmp2, tmp
-  // r[0] in upper half of tmp
-
-  smuadx tmp2, poly2, poly3
-  montgomery q, qinv, tmp2, tmp3
-  // r[1] in upper half of tmp3
-  pkhtb tmp, tmp3, tmp, asr#16
-  str tmp, [rptr], #4
-
-  pop {r4-r11, pc}
-
-
-.global doublebasemul_asm_acc
-.type doublebasemul_asm_acc, %function
-.align 2
-doublebasemul_asm_acc:
-  push {r4-r11, lr}
-
-  rptr  .req r0
-  aptr  .req r1
-  bptr  .req r2
-  zeta  .req r3
-  poly0 .req r4
-  poly1 .req r6
-  poly2 .req r5
-  poly3 .req r7
-  q     .req r8
-  qinv  .req r8
-  tmp   .req r9
-  tmp2  .req r10
-  tmp3  .req r11
-  r0r1  .req r12
-  r2r3  .req r14
-
-  movw  q, #3329
-  movt qinv, #3327
-
-  ldrd poly0, poly2, [aptr], #8
-  ldrd poly1, poly3, [bptr], #8
-  ldrd r0r1, r2r3, [rptr]
-
-  //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
-  smultt tmp, poly0, poly1
-  montgomery q, qinv, tmp, tmp2
-  smultb tmp2, tmp2, zeta
-  smlabb tmp2, poly0, poly1, tmp2
-  montgomery q, qinv, tmp2, tmp
-  // r[0] in upper half of tmp
-
-  smuadx tmp2, poly0, poly1
-  montgomery q, qinv, tmp2, tmp3
-  // r[1] in upper half of tmp3
-  pkhtb tmp, tmp3, tmp, asr#16
-  uadd16 r0r1, r0r1, tmp
-  str r0r1, [rptr], #4
-
-  neg zeta, zeta
-
-  //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
-  smultt tmp, poly2, poly3
-  montgomery q, qinv, tmp, tmp2
-  smultb tmp2, tmp2, zeta
-  smlabb tmp2, poly2, poly3, tmp2
-  montgomery q, qinv, tmp2, tmp
-  // r[0] in upper half of tmp
-
-  smuadx tmp2, poly2, poly3
-  montgomery q, qinv, tmp2, tmp3
-  // r[1] in upper half of tmp3
-  pkhtb tmp, tmp3, tmp, asr#16
-  uadd16 r2r3, r2r3, tmp
-  str r2r3, [rptr], #4
-
-  pop {r4-r11, pc}
-
-
-.global basemul_asm
-.type basemul_asm, %function
-.align 2
-basemul_asm:
-    push {r4-r11, lr}
-
-    .unreq zeta
-    rptr    .req r0
-    aptr    .req r1
-    bptr    .req r2
-    zetaptr .req r3
-    poly0   .req r4
-    poly1   .req r6
-    poly2   .req r5
-    poly3   .req r7
-    q       .req r8
-    qinv    .req r8
-    tmp     .req r9
-    tmp2    .req r10
-    tmp3    .req r11
-    zeta    .req r12
-    loop    .req r14
-
-    movw q, #3329
-    movt qinv, #3327
-
-    movw loop, #64
-    1:
-      ldr poly0, [aptr], #4
-      ldr poly1, [bptr], #4
-      ldr poly2, [aptr], #4
-      ldr poly3, [bptr], #4
-
-      ldrh zeta, [zetaptr], #2
-
-      // basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
-      smultt tmp, poly0, poly1
-      montgomery q, qinv, tmp, tmp2
-      smultb tmp2, tmp2, zeta
-      smlabb tmp2, poly0, poly1, tmp2
-      montgomery q, qinv, tmp2, tmp
-      // r[0] in upper half of tmp
-
-      smuadx tmp2, poly0, poly1
-      montgomery q, qinv, tmp2, tmp3
-      // r[1] in upper half of tmp3
-      pkhtb tmp, tmp3, tmp, asr#16
-      str tmp, [rptr], #4
-
-      neg zeta, zeta
-
-      // basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
-      smultt tmp, poly2, poly3
-      montgomery q, qinv, tmp, tmp2
-      smultb tmp2, tmp2, zeta
-      smlabb tmp2, poly2, poly3, tmp2
-      montgomery q, qinv, tmp2, tmp
-      // r[0] in upper half of tmp
-
-      smuadx tmp2, poly2, poly3
-      montgomery q, qinv, tmp2, tmp3
-      // r[1] in upper half of tmp3
-      pkhtb tmp, tmp3, tmp, asr#16
-      str tmp, [rptr], #4
-
-      subs.w loop, #1
-    bne.w 1b
-
-    pop {r4-r11, pc}
-
-
-.global basemul_asm_acc
-.type basemul_asm_acc, %function
-.align 2
-basemul_asm_acc:
-    push {r4-r11, lr}
-
-    rptr    .req r0
-    aptr    .req r1
-    bptr    .req r2
-    zetaptr .req r3
-    poly0   .req r4
-    poly1   .req r6
-    poly2   .req r5
-    poly3   .req r7
-    q       .req r8
-    qinv    .req r8
-    tmp     .req r9
-    tmp2    .req r10
-    tmp3    .req r11
-    zeta    .req r12
-    loop    .req r14
-
-    movw q, #3329
-    movt qinv, #3327
-
-    movw loop, #64
-    1:
-
-      ldrd poly0, poly2, [aptr], #8
-      ldrd poly1, poly3, [bptr], #8
-
-      ldrh zeta, [zetaptr], #2
-
-      //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
-      smultt tmp, poly0, poly1
-      montgomery q, qinv, tmp, tmp2
-      smultb tmp2, tmp2, zeta
-      smlabb tmp2, poly0, poly1, tmp2
-      montgomery q, qinv, tmp2, tmp
-      // r[0] in upper half of tmp
-
-      smuadx tmp2, poly0, poly1
-      montgomery q, qinv, tmp2, tmp3
-      // r[1] in upper half of tmp3
-      pkhtb tmp, tmp3, tmp, asr#16
-      ldr tmp3, [rptr]
-      uadd16 tmp, tmp, tmp3
-      str tmp, [rptr], #4
-
-      neg zeta, zeta
-
-      //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
-      smultt tmp, poly2, poly3
-      montgomery q, qinv, tmp, tmp2
-      smultb tmp2, tmp2, zeta
-      smlabb tmp2, poly2, poly3, tmp2
-      montgomery q, qinv, tmp2, tmp
-      // r[0] in upper half of tmp
-
-      smuadx tmp2, poly2, poly3
-      montgomery q, qinv, tmp2, tmp3
-      // r[1] in upper half of tmp3
-      pkhtb tmp, tmp3, tmp, asr#16
-      ldr tmp3, [rptr]
-      uadd16 tmp, tmp, tmp3
-      str tmp, [rptr], #4
-
-      subs.w loop, #1
-    bne.w 1b
-
-    pop {r4-r11, pc}
diff --git a/src/kyber512/fastinvntt.S b/src/kyber512/fastinvntt.S
deleted file mode 100644
index 7e72577..0000000
--- a/src/kyber512/fastinvntt.S
+++ /dev/null
@@ -1,186 +0,0 @@
-#include "macros.i"
-
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-.macro doubleinvbutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
-  usub16 \tmp, \a0, \a1
-  uadd16 \a0, \a0, \a1
-
-  smulb\tb \a1, \tmp, \twiddle
-  smult\tb \tmp, \tmp, \twiddle
-  montgomery \q, \qinv, \a1, \tmp2
-  montgomery \q, \qinv, \tmp, \a1
-  pkhtb \a1, \a1, \tmp2, asr#16
-.endm
-
-.macro two_doubleinvbutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv
-  doubleinvbutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv
-  doubleinvbutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro fqmulprecomp a, twiddle, tmp, tmp2, q, qinv
-  smulbt \tmp, \a, \twiddle
-  smultt \a, \a, \twiddle
-  montgomery \q, \qinv, \a, \tmp2
-  montgomery \q, \qinv, \tmp, \a
-  pkhtb \a, \tmp2, \a, asr#16
-.endm
-
-.macro fullmontgomery a0, a1, a2, a3, tmp, tmp2, q, qinv, montconst
-  movw \montconst, #2285
-  doublemontgomery \a0, \tmp, \tmp2, \q, \qinv, \montconst
-  doublemontgomery \a1, \tmp, \tmp2, \q, \qinv, \montconst
-  doublemontgomery \a2, \tmp, \tmp2, \q, \qinv, \montconst
-  doublemontgomery \a3, \tmp, \tmp2, \q, \qinv, \montconst
-.endm
-
-.global invntt_fast
-.type invntt_fast, %function
-.align 2
-invntt_fast:
-  push {r4-r11, lr}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  montconst   .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #3329
-  movt qinv, #3327
-
-  ### LAYER 1 (skip layer 0)
-  movw tmp, #16
-  1:
-    push {tmp}
-
-    ldm poly, {poly0-poly7}
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doubleinvbutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doubleinvbutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    fullmontgomery poly0, poly2, poly4, poly6, tmp, tmp2, q, qinv, montconst
-
-    stm poly!, {poly0-poly7}
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 1b
-
-  sub.w poly, #512
-
-  .equ distance, 32
-
-  ### LAYER 2+3+4
-  movw tmp, #8
-  2:
-    push {tmp}
-
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    ldr.w twiddle, [twiddle_ptr]
-    two_doubleinvbutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-    ldr.w twiddle, [twiddle_ptr, #4]
-    two_doubleinvbutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #8]
-    two_doubleinvbutterfly b, b, poly0, poly2, poly1, poly3, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly t, t, poly4, poly6, poly5, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldrh twiddle, [twiddle_ptr, #12]
-    two_doubleinvbutterfly b, b, poly0, poly4, poly1, poly5, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly b, b, poly2, poly6, poly3, poly7, twiddle, tmp, tmp2, q, qinv
-
-    fullmontgomery poly0, poly1, poly2, poly3, tmp, tmp2, q, qinv, montconst
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doubleinvbutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doubleinvbutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doubleinvbutterfly b, b, poly0, poly2, poly1, poly3, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly t, t, poly4, poly6, poly5, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldrh twiddle, [twiddle_ptr], #2
-    two_doubleinvbutterfly b, b, poly0, poly4, poly1, poly5, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly b, b, poly2, poly6, poly3, poly7, twiddle, tmp, tmp2, q, qinv
-
-	  fullmontgomery poly0, poly1, poly2, poly3, tmp, tmp2, q, qinv, montconst
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #60
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 2b
-
-  sub.w poly, #512
-
-  .equ distance, 8*distance
-
-  ### LAYER 5+6+7
-  movw tmp, #16
-  3:
-    push {tmp}
-
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    ldr.w twiddle, [twiddle_ptr]
-    two_doubleinvbutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-    ldr.w twiddle, [twiddle_ptr, #4]
-    two_doubleinvbutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #8]
-    two_doubleinvbutterfly b, b, poly0, poly2, poly1, poly3, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly t, t, poly4, poly6, poly5, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #12]
-    two_doubleinvbutterfly b, b, poly0, poly4, poly1, poly5, twiddle, tmp, tmp2, q, qinv
-    two_doubleinvbutterfly b, b, poly2, poly6, poly3, poly7, twiddle, tmp, tmp2, q, qinv
-
-    fqmulprecomp poly0, twiddle, tmp, tmp2, q, qinv
-    fqmulprecomp poly1, twiddle, tmp, tmp2, q, qinv
-    fqmulprecomp poly2, twiddle, tmp, tmp2, q, qinv
-    fqmulprecomp poly3, twiddle, tmp, tmp2, q, qinv
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 3b
-
-  pop {r4-r11, pc}
diff --git a/src/kyber512/fastntt.S b/src/kyber512/fastntt.S
deleted file mode 100644
index 1c34b83..0000000
--- a/src/kyber512/fastntt.S
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "macros.i"
-
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
-  smulb\tb \tmp, \a1, \twiddle
-  smult\tb \a1, \a1, \twiddle
-  montgomery \q, \qinv, \tmp, \tmp2
-  montgomery \q, \qinv, \a1, \tmp
-  pkhtb \tmp2, \tmp, \tmp2, asr#16
-  usub16 \a1, \a0, \tmp2
-  uadd16 \a0, \a0, \tmp2
-.endm
-
-.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv
-  doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv
-  doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv
-.endm
-
-.global ntt_fast
-.type ntt_fast, %function
-.align 2
-ntt_fast:
-  push {r4-r11, r14}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  barrettconst .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #3329
-  movt qinv, #3327
-
-  .equ barrett_constant, 20159
-  .equ distance, 256
-
-  ### LAYER 7+6+5
-  movw tmp, #16
-  1:
-    push {tmp}
-
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    ldrh twiddle, [twiddle_ptr]
-    two_doublebutterfly b, b, poly0, poly4, poly1, poly5, twiddle, tmp, tmp2, q, qinv
-    two_doublebutterfly b, b, poly2, poly6, poly3, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #2]
-    two_doublebutterfly b, b, poly0, poly2, poly1, poly3, twiddle, tmp, tmp2, q, qinv
-    two_doublebutterfly t, t, poly4, poly6, poly5, poly7, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #6]
-    two_doublebutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr, #10]
-    two_doublebutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 1b
-
-  sub.w poly, #64
-  add.w twiddle_ptr, #14
-
-  .equ distance, distance/8
-
-  ### LAYER 4+3+2
-  movw tmp, #8
-  2:
-    push {tmp}
-
-    movw tmp, #2
-    3:
-      push {tmp}
-
-      load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-      load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-
-      ldrh twiddle, [twiddle_ptr]
-      two_doublebutterfly b, b, poly0, poly4, poly1, poly5, twiddle, tmp, tmp2, q, qinv
-      two_doublebutterfly b, b, poly2, poly6, poly3, poly7, twiddle, tmp, tmp2, q, qinv
-
-      ldr.w twiddle, [twiddle_ptr, #2]
-      two_doublebutterfly b, b, poly0, poly2, poly1, poly3, twiddle, tmp, tmp2, q, qinv
-      two_doublebutterfly t, t, poly4, poly6, poly5, poly7, twiddle, tmp, tmp2, q, qinv
-
-      ldr.w twiddle, [twiddle_ptr, #6]
-      two_doublebutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-
-      ldr.w twiddle, [twiddle_ptr, #10]
-      two_doublebutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-      store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-      str.w poly1, [poly, #distance/4]
-      str.w poly2, [poly, #2*distance/4]
-      str.w poly3, [poly, #3*distance/4]
-      str.w poly0, [poly], #4
-
-      pop {tmp}
-      subs.w tmp, #1
-    bne.w 3b
-
-    add.w poly, #56
-    add.w twiddle_ptr, #14
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 2b
-
-  sub poly, #512
-
-  ### LAYER 1 (skip layer 0)
-  movw tmp, #16
-  4:
-    push {tmp}
-
-    ldm poly, {poly0-poly7}
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, poly0, poly1, poly2, poly3, twiddle, tmp, tmp2, q, qinv
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, poly4, poly5, poly6, poly7, twiddle, tmp, tmp2, q, qinv
-
-    movw barrettconst, #barrett_constant
-    doublebarrett poly0, tmp, tmp2, q, barrettconst
-    doublebarrett poly1, tmp, tmp2, q, barrettconst
-    doublebarrett poly2, tmp, tmp2, q, barrettconst
-    doublebarrett poly3, tmp, tmp2, q, barrettconst
-    doublebarrett poly4, tmp, tmp2, q, barrettconst
-    doublebarrett poly5, tmp, tmp2, q, barrettconst
-    doublebarrett poly6, tmp, tmp2, q, barrettconst
-    doublebarrett poly7, tmp, tmp2, q, barrettconst
-
-    stm poly!, {poly0-poly7}
-
-    pop {tmp}
-    subs.w tmp, #1
-  bne.w 4b
-
-  pop {r4-r11, pc}
diff --git a/src/kyber512/indcpa.c b/src/kyber512/indcpa.c
deleted file mode 100644
index acc83b8..0000000
--- a/src/kyber512/indcpa.c
+++ /dev/null
@@ -1,276 +0,0 @@
-#include "indcpa.h"
-#include "ntt.h"
-#include "poly.h"
-#include "polyvec.h"
-#include "randombytes.h"
-#include "symmetric.h"
-
-#include <string.h>
-#include <stdint.h>
-
-extern void doublebasemul_asm_acc(int16_t *r, const int16_t *a, const int16_t *b, int16_t zeta);
-/*************************************************
-* Name:        matacc
-*
-* Description: Multiplies a row of A or A^T, generated on-the-fly,
-*              with a vector of polynomials and accumulates into the result.
-*
-* Arguments:   - poly *r:                    pointer to output polynomial to accumulate in
-*              - polyvec *b:                 pointer to input vector of polynomials to multiply with
-*              - unsigned char i:            byte to indicate the index < KYBER_K of the row of A or A^T
-*              - const unsigned char *seed:  pointer to the public seed used to generate A
-*              - int transposed:             boolean indicatin whether A or A^T is generated
-**************************************************/
-static void matacc(poly* r, polyvec *b, unsigned char i, const unsigned char *seed, int transposed) {
-  unsigned char buf[XOF_BLOCKBYTES+2];
-  unsigned int buflen, off;
-  xof_state state;
-  unsigned int ctr, pos, k, l;
-  uint16_t val0, val1;
-  int16_t c[4];
-
-  poly_zeroize(r);
-
-  for(int j=0;j<KYBER_K;j++) {
-    ctr = pos = 0;
-    if (transposed)
-      xof_absorb(&state, seed, i, j);
-    else
-      xof_absorb(&state, seed, j, i);
-
-    xof_squeezeblocks(buf, 1, &state);
-    buflen = XOF_BLOCKBYTES;
-
-    k = 0;
-    while (ctr < KYBER_N/4)
-    {
-      val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
-      val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4)) & 0xFFF;
-      pos += 3;
-
-      if (val0 < KYBER_Q) {
-        c[k++] = (int16_t) val0;
-        if (k == 4) {
-          doublebasemul_asm_acc(&r->coeffs[4*ctr], &b->vec[j].coeffs[4*ctr], c, zetas[ctr]);
-          ctr++;
-          k = 0;
-        }
-      }
-
-      if (val1 < KYBER_Q && ctr < KYBER_Q/4) {
-        c[k++] = (int16_t) val1;
-        if (k == 4) {
-          doublebasemul_asm_acc(&r->coeffs[4*ctr], &b->vec[j].coeffs[4*ctr], c, zetas[ctr]);
-          ctr++;
-          k = 0;
-        }
-      }
-
-      if (pos + 3 > buflen && ctr < KYBER_Q/4) {
-        off = buflen % 3;
-        for(l = 0; l < off; l++)
-          buf[l] = buf[buflen - off + l];
-        xof_squeezeblocks(buf + off, 1, &state);
-        buflen = off + XOF_BLOCKBYTES;
-        pos = 0;
-      }
-    }
-  }
-}
-
-/*************************************************
-* Name:        indcpa_keypair
-*
-* Description: Generates public and private key for the CPA-secure
-*              public-key encryption scheme underlying Kyber
-*
-* Arguments:   - unsigned char *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - unsigned char *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
-**************************************************/
-void indcpa_keypair(unsigned char *pk, unsigned char *sk) {
-    polyvec skpv;
-    poly pkp;
-    unsigned char buf[2 * KYBER_SYMBYTES];
-    unsigned char *publicseed = buf;
-    unsigned char *noiseseed = buf + KYBER_SYMBYTES;
-    int i;
-    unsigned char nonce = 0;
-
-    randombytes(buf, KYBER_SYMBYTES);
-    hash_g(buf, buf, KYBER_SYMBYTES);
-
-    for (i = 0; i < KYBER_K; i++)
-        poly_getnoise_eta1(skpv.vec + i, noiseseed, nonce++);
-
-    polyvec_ntt(&skpv);
-
-    for (i = 0; i < KYBER_K; i++) {
-        matacc(&pkp, &skpv, i, publicseed, 0);
-        poly_invntt(&pkp);
-
-        poly_addnoise_eta1(&pkp, noiseseed, nonce++);
-        poly_ntt(&pkp);
-
-        poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp);
-    }
-
-    polyvec_tobytes(sk, &skpv);
-    memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key
-}
-
-/*************************************************
-* Name:        indcpa_enc
-*
-* Description: Encryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - unsigned char *c:          pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
-*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
-*                                           to deterministically generate all randomness
-**************************************************/
-void indcpa_enc(unsigned char *c,
-               const unsigned char *m,
-               const unsigned char *pk,
-               const unsigned char *coins) {
-    polyvec sp;
-    poly bp;
-    poly *pkp = &bp;
-    poly *k = &bp;
-    poly *v = &sp.vec[0];
-    const unsigned char *seed = pk+KYBER_POLYVECBYTES;
-    int i;
-    unsigned char nonce = 0;
-
-    for (i = 0; i < KYBER_K; i++)
-        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
-
-    polyvec_ntt(&sp);
-
-    for (i = 0; i < KYBER_K; i++) {
-        matacc(&bp, &sp, i, seed, 1);
-        poly_invntt(&bp);
-
-        poly_addnoise_eta2(&bp, coins, nonce++);
-        poly_reduce(&bp);
-
-        poly_packcompress(c, &bp, i);
-    }
-
-    poly_frombytes(pkp, pk);
-    poly_basemul(v, pkp, &sp.vec[0]);
-    for (i = 1; i < KYBER_K; i++) {
-        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
-        poly_basemul_acc(v, pkp, &sp.vec[i]);
-    }
-
-    poly_invntt(v);
-
-    poly_addnoise_eta2(v, coins, nonce++);
-
-    poly_frommsg(k, m);
-    poly_add(v, v, k);
-    poly_reduce(v);
-
-    poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
-}
-
-/*************************************************
-* Name:        indcpa_enc_cmp
-*
-* Description: Re-encryption function.
-*              Compares the re-encypted ciphertext with the original ciphertext byte per byte.
-*              The comparison is performed in a constant time manner.
-*
-*
-* Arguments:   - unsigned char *ct:         pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes)
-*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
-*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
-*              - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
-*                                           to deterministically generate all randomness
-* Returns:     - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext
-**************************************************/
-unsigned char indcpa_enc_cmp(const unsigned char *c,
-                             const unsigned char *m,
-                             const unsigned char *pk,
-                             const unsigned char *coins) {
-    uint64_t rc = 0;
-    polyvec sp;
-    poly bp;
-    poly *pkp = &bp;
-    poly *k = &bp;
-    poly *v = &sp.vec[0];
-    const unsigned char *seed = pk+KYBER_POLYVECBYTES;
-    int i;
-    unsigned char nonce = 0;
-
-    for (i = 0; i < KYBER_K; i++)
-        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
-
-    polyvec_ntt(&sp);
-
-    for (i = 0; i < KYBER_K; i++) {
-        matacc(&bp, &sp, i, seed, 1);
-        poly_invntt(&bp);
-
-        poly_addnoise_eta2(&bp, coins, nonce++);
-        poly_reduce(&bp);
-
-        rc |= cmp_poly_packcompress(c, &bp, i);
-    }
-
-    poly_frombytes(pkp, pk);
-    poly_basemul(v, pkp, &sp.vec[0]);
-    for (i = 1; i < KYBER_K; i++) {
-        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
-        poly_basemul_acc(v, pkp, &sp.vec[i]);
-    }
-
-    poly_invntt(v);
-
-    poly_addnoise_eta2(v, coins, nonce++);
-    poly_frommsg(k, m);
-    poly_add(v, v, k);
-    poly_reduce(v);
-
-    rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
-
-    rc = ~rc + 1;
-    rc >>= 63;
-    return (unsigned char)rc;
-}
-
-/*************************************************
-* Name:        indcpa_dec
-*
-* Description: Decryption function of the CPA-secure
-*              public-key encryption scheme underlying Kyber.
-*
-* Arguments:   - unsigned char *m:        pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
-*              - const unsigned char *c:  pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
-*              - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
-**************************************************/
-void __attribute__ ((noinline)) indcpa_dec(unsigned char *m,
-                                           const unsigned char *c,
-                                           const unsigned char *sk) {
-    poly mp, bp;
-    poly *v = &bp;
-
-    poly_unpackdecompress(&mp, c, 0);
-    poly_ntt(&mp);
-    poly_frombytes_mul(&mp, sk);
-    for(int i = 1; i < KYBER_K; i++) {
-        poly_unpackdecompress(&bp, c, i);
-        poly_ntt(&bp);
-        poly_frombytes_mul(&bp, sk + i*KYBER_POLYBYTES);
-        poly_add(&mp, &mp, &bp);
-    }
-
-    poly_invntt(&mp);
-    poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
-    poly_sub(&mp, v, &mp);
-    poly_reduce(&mp);
-
-    poly_tomsg(m, &mp);
-}
diff --git a/src/kyber512/indcpa.h b/src/kyber512/indcpa.h
deleted file mode 100644
index fcf6aa0..0000000
--- a/src/kyber512/indcpa.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-void indcpa_keypair(unsigned char *pk,
-                    unsigned char *sk);
-
-void indcpa_enc(unsigned char *c,
-                const unsigned char *m,
-                const unsigned char *pk,
-                const unsigned char *coins);
-
-unsigned char indcpa_enc_cmp(const unsigned char *ct,
-                             const unsigned char *m,
-                             const unsigned char *pk,
-                             const unsigned char *coins);
-
-void indcpa_dec(unsigned char *m,
-                const unsigned char *c,
-                const unsigned char *sk);
-
-#endif
diff --git a/src/kyber512/kem.c b/src/kyber512/kem.c
deleted file mode 100644
index 31f16e3..0000000
--- a/src/kyber512/kem.c
+++ /dev/null
@@ -1,99 +0,0 @@
-#include "api.h"
-#include "indcpa.h"
-#include "params.h"
-#include "randombytes.h"
-#include "symmetric.h"
-#include "verify.h"
-
-#include <stdlib.h>
-
-#include <stdlib.h>
-
-/*************************************************
-* Name:        crypto_kem_keypair
-*
-* Description: Generates public and private key
-*              for CCA-secure Kyber key encapsulation mechanism
-*
-* Arguments:   - unsigned char *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
-*              - unsigned char *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
-    size_t i;
-    indcpa_keypair(pk, sk);
-    for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
-        sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i];
-    }
-    hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
-    randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES);    /* Value z for pseudo-random output on reject */
-    return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_enc
-*
-* Description: Generates cipher text and shared
-*              secret for given public key
-*
-* Arguments:   - unsigned char *ct:       pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
-*              - unsigned char *ss:       pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
-*              - const unsigned char *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
-*
-* Returns 0 (success)
-**************************************************/
-int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) {
-    unsigned char  kr[2 * KYBER_SYMBYTES];                                   /* Will contain key, coins */
-    unsigned char buf[2 * KYBER_SYMBYTES];
-
-    randombytes(buf, KYBER_SYMBYTES);
-    hash_h(buf, buf, KYBER_SYMBYTES);                                        /* Don't release system RNG output */
-
-    hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);                  /* Multitarget countermeasure for coins + contributory KEM */
-    hash_g(kr, buf, 2 * KYBER_SYMBYTES);
-
-    indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES);                            /* coins are in kr+KYBER_SYMBYTES */
-
-    hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);                  /* overwrite coins in kr with H(c) */
-    kdf(ss, kr, 2 * KYBER_SYMBYTES);                                         /* hash concatenation of pre-k and H(c) to k */
-    return 0;
-}
-
-/*************************************************
-* Name:        crypto_kem_dec
-*
-* Description: Generates shared secret for given
-*              cipher text and private key
-*
-* Arguments:   - unsigned char *ss:       pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
-*              - const unsigned char *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
-*              - const unsigned char *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
-*
-* Returns 0.
-*
-* On failure, ss will contain a pseudo-random value.
-**************************************************/
-int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
-    size_t i;
-    unsigned char fail;
-    unsigned char buf[2 * KYBER_SYMBYTES];
-    unsigned char kr[2 * KYBER_SYMBYTES];                                             /* Will contain key, coins */
-    const unsigned char *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;
-
-    indcpa_dec(buf, ct, sk);
-
-    for (i = 0; i < KYBER_SYMBYTES; i++) {                                            /* Multitarget countermeasure for coins + contributory KEM */
-        buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];  /* Save hash by storing H(pk) in sk */
-    }
-    hash_g(kr, buf, 2 * KYBER_SYMBYTES);
-
-    fail = indcpa_enc_cmp(ct, buf, pk, kr + KYBER_SYMBYTES);                          /* coins are in kr+KYBER_SYMBYTES */
-
-    hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);                           /* overwrite coins in kr with H(c)  */
-
-    cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);       /* Overwrite pre-k with z on re-encryption failure */
-
-    kdf(ss, kr, 2 * KYBER_SYMBYTES);                                                  /* hash concatenation of pre-k and H(c) to k */
-    return 0;
-}
diff --git a/src/kyber512/macros.i b/src/kyber512/macros.i
deleted file mode 100644
index f3a30b5..0000000
--- a/src/kyber512/macros.i
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef MACROS_I
-#define MACROS_I
-
-.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  ldr.w \a0, [\a, \mem0]
-  ldr.w \a1, [\a, \mem1]
-  ldr.w \a2, [\a, \mem2]
-  ldr.w \a3, [\a, \mem3]
-.endm
-
-.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  str.w \a0, [\a, \mem0]
-  str.w \a1, [\a, \mem1]
-  str.w \a2, [\a, \mem2]
-  str.w \a3, [\a, \mem3]
-.endm
-
-.macro doublebarrett a, tmp, tmp2, q, barrettconst
-  smulbb \tmp, \a, \barrettconst
-  smultb \tmp2, \a, \barrettconst
-  asr \tmp, \tmp, #26
-  asr \tmp2, \tmp2, #26
-  smulbb \tmp, \tmp, \q
-  smulbb \tmp2, \tmp2, \q
-  pkhbt \tmp, \tmp, \tmp2, lsl#16
-  usub16 \a, \a, \tmp
-.endm
-
-.macro montgomery q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \tmp, \q, \tmp, \a
-.endm
-
-.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst
-  smulbb \tmp2, \a, \montconst
-  montgomery \q, \qinv, \tmp2, \tmp
-  smultb \a, \a, \montconst
-  montgomery \q, \qinv, \a, \tmp2
-  pkhtb \a, \tmp2, \tmp, asr#16
-.endm
-
-#endif /* MACROS_I */
diff --git a/src/kyber512/ntt.c b/src/kyber512/ntt.c
deleted file mode 100644
index e6f3d03..0000000
--- a/src/kyber512/ntt.c
+++ /dev/null
@@ -1,125 +0,0 @@
-#include "ntt.h"
-
-#include "params.h"
-
-#include <stdint.h>
-
-/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
-
-#define KYBER_ROOT_OF_UNITY 17
-
-static const uint16_t tree[128] = {
-  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
-  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
-  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
-  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
-  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
-  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
-  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
-  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127};
-
-
-static int16_t fqmul(int16_t a, int16_t b) {
-  return montgomery_reduce((int32_t)a*b);
-}
-
-void init_ntt() {
-  unsigned int i, j, k;
-  int16_t tmp[128];
-
-  tmp[0] = MONT;
-  for(i = 1; i < 128; ++i)
-    tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q);
-
-  for(i = 0; i < 128; ++i)
-    zetas[i] = tmp[tree[i]];
-
-  k = 0;
-  for(i = 64; i >= 1; i >>= 1)
-    for(j = i; j < 2*i; ++j)
-      zetas_inv[k++] = -tmp[128 - tree[j]];
-
-  zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
-}
-
-*/
-
-const int16_t zetas[64] = { 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869,
-1574, 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349,
-418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193,
-1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
-3221, 3021, 996, 991, 958, 1869, 1522, 1628 };
-
-const int16_t zetas_asm[128] = {
-// 7 & 6 & 5 layers
-2571, 2970, 1812, 1493, 1422, 287, 202,
-// 1st loop of 4 & 3 & 2 layers
-3158, 573, 2004, 1223, 652, 2777, 1015,
-// 2nd loop of 4 & 3 & 2 layers
-622, 264, 383, 2036, 1491, 3047, 1785,
-// 3rd loop of 4 & 3 & 2 layers
-1577, 2500, 1458, 516, 3321, 3009, 2663,
-// 4th loop of 4 & 3 & 2 layers
-182, 1727, 3199, 1711, 2167, 126, 1469,
-// 5th loop of 4 & 3 & 2 layers
-962, 2648, 1017, 2476, 3239, 3058, 830,
-// 6th loop of 4 & 3 & 2 layers
-2127, 732, 608, 107, 1908, 3082, 2378,
-// 7th loop of 4 & 3 & 2 layers
-1855, 1787, 411, 2931, 961, 1821, 2604,
-// 8th loop of 4 & 3 & 2 layers
-1468, 3124, 1758, 448, 2264, 677, 2054,
-// 1 layer
-2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628,
-};
-
-const int16_t zetas_inv_asm[128] = {
-// 1 layer
-1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103,
-// 1st loop of 2 & 3 & 4 layers
-1275, 2652, 1065, 2881, 1571, 205, 1861,
-// 2nd loop of 2 & 3 & 4 layers
-725, 1508, 2368, 398, 2918, 1542, 1474,
-// 3rd loop of 2 & 3 & 4 layers
-951, 247, 1421, 3222, 2721, 2597, 1202,
-// 4th loop of 2 & 3 & 4 layers
-2499, 271, 90, 853, 2312, 681, 2367,
-// 5th loop of 2 & 3 & 4 layers
-1860, 3203, 1162, 1618, 130, 1602, 3147,
-// 6th loop of 2 & 3 & 4 layers
-666, 320, 8, 2813, 1871, 829, 1752,
-// 7th loop of 2 & 3 & 4 layers
-1544, 282, 1838, 1293, 2946, 3065, 2707,
-// 8th loop of 2 & 3 & 4 layers
-2314, 552, 2677, 2106, 1325, 2756, 171,
-// 5 & 6 & 7 layers
-3127, 3042, 1907, 1836, 1517, 359, 1932,
-// 128^-1 * 2^32
-1441
-};
-
-extern void ntt_fast(int16_t *, const int16_t *);
-/*************************************************
-* Name:        ntt
-*
-* Description: Inplace number-theoretic transform (NTT) in Rq
-*              input is in standard order, output is in bitreversed order
-*
-* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
-**************************************************/
-void ntt(int16_t *poly) {
-    ntt_fast(poly, zetas_asm);
-}
-
-extern void invntt_fast(int16_t *, const int16_t *);
-/*************************************************
-* Name:        invntt
-*
-* Description: Inplace inverse number-theoretic transform in Rq
-*              input is in bitreversed order, output is in standard order
-*
-* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
-**************************************************/
-void invntt(int16_t *poly) {
-    invntt_fast(poly, zetas_inv_asm);
-}
diff --git a/src/kyber512/ntt.h b/src/kyber512/ntt.h
deleted file mode 100644
index a8e6b76..0000000
--- a/src/kyber512/ntt.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef NTT_H
-#define NTT_H
-
-#include <stdint.h>
-
-extern const int16_t zetas[64];
-
-void ntt(int16_t *poly);
-void invntt(int16_t *poly);
-
-#endif
diff --git a/src/kyber512/params.h b/src/kyber512/params.h
deleted file mode 100644
index be9ec45..0000000
--- a/src/kyber512/params.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef PARAMS_H
-#define PARAMS_H
-
-#define KYBER_K 2 /* Change this for different security strengths */
-
-/* Don't change parameters below this line */
-
-#define KYBER_N 256
-#define KYBER_Q 3329
-
-#define KYBER_ETA1 3
-#define KYBER_ETA2 2
-
-#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
-#define KYBER_SSBYTES  32   /* size in bytes of shared key */
-
-#define KYBER_POLYBYTES              384
-#define KYBER_POLYVECBYTES           (KYBER_K * KYBER_POLYBYTES)
-
-#define KYBER_POLYCOMPRESSEDBYTES    128
-#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
-
-#define KYBER_INDCPA_MSGBYTES       KYBER_SYMBYTES
-#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
-#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
-#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
-
-#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
-#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES +  KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
-#define KYBER_CIPHERTEXTBYTES  KYBER_INDCPA_BYTES
-
-#endif
diff --git a/src/kyber512/poly.c b/src/kyber512/poly.c
deleted file mode 100644
index 0765d76..0000000
--- a/src/kyber512/poly.c
+++ /dev/null
@@ -1,581 +0,0 @@
-#include "poly.h"
-
-#include "cbd.h"
-#include "ntt.h"
-#include "params.h"
-#include "symmetric.h"
-
-#include <stdint.h>
-
-
-/*************************************************
-* Name:        poly_compress
-*
-* Description: Serialization of a polynomial and subsequent compression of a polynomial;
-*
-* Arguments:   - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES)
-*              - const poly *a:    pointer to input polynomial to be serialized
-*************************************************/
-void poly_compress(unsigned char *r, poly *a)
-{
-  uint8_t t[8];
-  int i,j,k=0;
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 96)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    for(j=0;j<8;j++)
-      t[j] = ((((uint32_t)a->coeffs[i+j] << 3) + KYBER_Q/2) / KYBER_Q) & 7;
-
-    r[k]   =  t[0]       | (t[1] << 3) | (t[2] << 6);
-    r[k+1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
-    r[k+2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
-    k += 3;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    for(j=0;j<8;j++)
-      t[j] = ((((uint32_t)a->coeffs[i+j] << 4) + KYBER_Q/2) / KYBER_Q) & 15;
-
-    r[k]   = t[0] | (t[1] << 4);
-    r[k+1] = t[2] | (t[3] << 4);
-    r[k+2] = t[4] | (t[5] << 4);
-    r[k+3] = t[6] | (t[7] << 4);
-    k += 4;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    for(j=0;j<8;j++)
-      t[j] = ((((uint32_t)a->coeffs[i+j] << 5) + KYBER_Q/2) / KYBER_Q) & 31;
-
-    r[k]   =  t[0]       | (t[1] << 5);
-    r[k+1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
-    r[k+2] = (t[3] >> 1) | (t[4] << 4);
-    r[k+3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
-    r[k+4] = (t[6] >> 2) | (t[7] << 3);
-    k += 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_decompress
-*
-* Description: De-serialization and subsequent decompression of a polynomial;
-*              approximate inverse of poly_compress
-*
-* Arguments:   - poly *r:                pointer to output polynomial
-*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
-**************************************************/
-void poly_decompress(poly *r, const unsigned char *a)
-{
-  int i;
-#if (KYBER_POLYCOMPRESSEDBYTES == 96)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    r->coeffs[i+0] =  (((a[0] & 7) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+1] = ((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+2] = ((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+3] = ((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+4] = ((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+5] = ((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+6] = ((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3;
-    r->coeffs[i+7] = ((((a[2] >> 5)) * KYBER_Q) + 4) >> 3;
-    a += 3;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
-    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
-    a += 4;
-  }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-  for(i=0;i<KYBER_N;i+=8)
-  {
-    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] &  1) << 4)) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] &  7) << 2)) * KYBER_Q) + 16) >> 5;
-    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
-    a += 5;
-  }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
-#endif
-}
-
-/*************************************************
-* Name:        poly_packcompress
-*
-* Description: Serialization and subsequent compression of a polynomial of a polyvec,
-*              writes to a byte string representation of the whole polyvec.
-*              Used to compress a polyvec one poly at a time in a loop.
-*
-* Arguments:   - unsigned char *r:  pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
-*              - const poly *a:     pointer to input polynomial
-*              - int i:             index of to be serialized polynomial in serialized polyec
-**************************************************/
-void poly_packcompress(unsigned char *r, poly *a, int i) {
-    int j, k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-
-  for(j=0;j<KYBER_N/8;j++) {
-    for(k=0;k<8;k++)
-      t[k] = ((((uint32_t)a->coeffs[8*j+k] << 11) + KYBER_Q/2) / KYBER_Q) & 0x7ff;
-
-    r[352*i+11*j+ 0] =  t[0] & 0xff;
-    r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
-    r[352*i+11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
-    r[352*i+11*j+ 3] = (t[2] >>  2) & 0xff;
-    r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
-    r[352*i+11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
-    r[352*i+11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
-    r[352*i+11*j+ 7] = (t[5] >>  1) & 0xff;
-    r[352*i+11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
-    r[352*i+11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
-    r[352*i+11*j+10] = (t[7] >>  3);
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-    uint16_t t[4];
-
-    for (j = 0; j < KYBER_N / 4; j++) {
-        for (k = 0; k < 4; k++)
-            t[k] = ((((uint32_t)a->coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff;
-
-        r[320*i+5*j+0] =   t[0] & 0xff;
-        r[320*i+5*j+1] =  (t[0] >>  8) | ((t[1] & 0x3f) << 2);
-        r[320*i+5*j+2] = ((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff;
-        r[320*i+5*j+3] = ((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff;
-        r[320*i+5*j+4] =  (t[3] >>  2) & 0xff;
-    }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})"
-#endif
-}
-
-/*************************************************
-* Name:        poly_unpackdecompress
-*
-* Description: Deserialization and subsequent compression of a polynomial of a polyvec,
-*              Used to uncompress a polyvec one poly at a time in a loop.
-*
-* Arguments:   - const poly *r:     pointer to output polynomial
-*              - unsigned char *a:  pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
-*              - int i:             index of poly in polyvec to decompress
-**************************************************/
-void poly_unpackdecompress(poly *r, const unsigned char *a, int i) {
-  int j;
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-    for(j=0;j<KYBER_N/8;j++)
-    {
-      r->coeffs[8*j+0] =  (((a[352*i+11*j+ 0]       | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
-      r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
-    }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-    for(j=0;j<KYBER_N/4;j++)
-    {
-      r->coeffs[4*j+0] =  (((a[320*i+5*j+ 0]       | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
-      r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
-      r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
-      r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
-    }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-
-/*************************************************
-* Name:        cmp_poly_compress
-*
-* Description: Serializes and consequently compares polynomial to a serialized polynomial
-*
-* Arguments:   - const unsigned char *r:    pointer to serialized polynomial to compare with
-*              - poly *a:                   pointer to input polynomial to serialize and compare
-* Returns:                                  boolean indicating whether the polynomials are equal
-**************************************************/
-int cmp_poly_compress(const unsigned char *r, poly *a) {
-    unsigned char rc = 0;
-    uint8_t t[8];
-    int i, j, k = 0;
-
-#if (KYBER_POLYCOMPRESSEDBYTES == 96)
-    for(i=0;i<KYBER_N;i+=8)
-    {
-      for(j=0;j<8;j++)
-        t[j] = ((((uint32_t)a->coeffs[i+j] << 3) + KYBER_Q/2) / KYBER_Q) & 7;
-
-      rc |= r[k]   ^ (t[0]       | (t[1] << 3) | (t[2] << 6));
-      rc |= r[k+1] ^ ((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7));
-      rc |= r[k+2] ^ ((t[5] >> 1) | (t[6] << 2) | (t[7] << 5));
-      k += 3;
-    }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
-    for (i = 0; i < KYBER_N; i += 8) {
-        for (j = 0; j < 8; j++)
-            t[j] = ((((uint32_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15;
-
-        rc |= r[k]      ^ (t[0] | (t[1] << 4));
-        rc |= r[k + 1]  ^ (t[2] | (t[3] << 4));
-        rc |= r[k + 2]  ^ (t[4] | (t[5] << 4));
-        rc |= r[k + 3]  ^ (t[6] | (t[7] << 4));
-        k += 4;
-    }
-#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
-    for(i=0;i<KYBER_N;i+=8)
-    {
-      for(j=0;j<8;j++)
-        t[j] = ((((uint32_t)a->coeffs[i+j] << 5) + KYBER_Q/2) / KYBER_Q) & 31;
-
-      rc |= r[k]   ^ (t[0]       | (t[1] << 5));
-      rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
-      rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4));
-      rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
-      rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3));
-      k += 5;
-    }
-#else
-#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
-#endif
-    return rc;
-}
-
-/*************************************************
-* Name:        cmp_poly_packcompress
-*
-* Description: Serializes and consequently compares poly of polyvec to a serialized polyvec
-*              Should be called in a loop over all poly's of a polyvec.
-*
-* Arguments:   - const unsigned char *r:    pointer to serialized polyvec to compare with
-*              - poly *a:                   pointer to input polynomial of polyvec to serialize and compare
-*              - int i:                     index of poly in polyvec to compare with
-* Returns:                                  boolean indicating whether the polyvecs are equal
-**************************************************/
-int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) {
-    unsigned char rc = 0;
-    int j, k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-    for(j=0;j<KYBER_N/8;j++)
-    {
-      for(k=0;k<8;k++)
-        t[k] = ((((uint32_t)a->coeffs[8*j+k] << 11) + KYBER_Q/2) / KYBER_Q) & 0x7ff;
-
-      rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff);
-      rc |= r[352*i+11*j+ 1] ^ ((t[0] >>  8) | ((t[1] & 0x1f) << 3));
-      rc |= r[352*i+11*j+ 2] ^ ((t[1] >>  5) | ((t[2] & 0x03) << 6));
-      rc |= r[352*i+11*j+ 3] ^ ((t[2] >>  2) & 0xff);
-      rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
-      rc |= r[352*i+11*j+ 5] ^ ((t[3] >>  7) | ((t[4] & 0x0f) << 4));
-      rc |= r[352*i+11*j+ 6] ^ ((t[4] >>  4) | ((t[5] & 0x01) << 7));
-      rc |= r[352*i+11*j+ 7] ^ ((t[5] >>  1) & 0xff);
-      rc |= r[352*i+11*j+ 8] ^ ((t[5] >>  9) | ((t[6] & 0x3f) << 2));
-      rc |= r[352*i+11*j+ 9] ^ ((t[6] >>  6) | ((t[7] & 0x07) << 5));
-      rc |= r[352*i+11*j+10] ^ ((t[7] >>  3));
-    }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-    uint16_t t[4];
-        for (j = 0; j < KYBER_N / 4; j++) {
-            for (k = 0; k < 4; k++)
-                t[k] = ((((uint32_t)a->coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff;
-
-            rc |= r[320*i+5*j+0] ^ (t[0] & 0xff);
-            rc |= r[320*i+5*j+1] ^ ((t[0] >>  8) | ((t[1] & 0x3f) << 2));
-            rc |= r[320*i+5*j+2] ^ (((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff);
-            rc |= r[320*i+5*j+3] ^ (((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff);
-            rc |= r[320*i+5*j+4] ^ ((t[3] >>  2) & 0xff);
-        }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-    return rc;
-}
-
-/*************************************************
-* Name:        poly_tobytes
-*
-* Description: Serialization of a polynomial
-*
-* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
-*              - const poly *a:    pointer to input polynomial
-**************************************************/
-void poly_tobytes(unsigned char *r, poly *a) {
-    int i;
-    uint16_t t0, t1;
-
-    poly_reduce(a);
-
-    for (i = 0; i < KYBER_N / 2; i++) {
-        t0 = a->coeffs[2 * i];
-        t1 = a->coeffs[2 * i + 1];
-        r[3 * i] = t0 & 0xff;
-        r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4);
-        r[3 * i + 2] = (t1 >> 4) & 0xff;
-    }
-}
-
-/*************************************************
-* Name:        poly_frombytes
-*
-* Description: De-serialization of a polynomial;
-*              inverse of poly_tobytes
-*
-* Arguments:   - poly *r:                pointer to output polynomial
-*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes(poly *r, const unsigned char *a) {
-    int i;
-
-    for (i = 0; i < KYBER_N / 2; i++) {
-        r->coeffs[2 * i]     = a[3 * i]          | ((uint16_t)a[3 * i + 1] & 0x0f) << 8;
-        r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4;
-    }
-}
-
-
-extern void doublebasemul_asm(int16_t *r, const int16_t *a, const int16_t *b, int16_t zeta);
-/*************************************************
-* Name:        poly_frombytes_mul
-*
-* Description: Multiplication of a polynomial with a de-serialization of another polynomial
-*
-* Arguments:   - poly *r:                pointer to output polynomial
-*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
-**************************************************/
-void poly_frombytes_mul(poly *r, const unsigned char *a) {
-    int16_t tmp[4];
-    int i;
-
-    for (i = 0; i < KYBER_N / 4; i++) {
-        tmp[0] = a[6 * i]          | ((uint16_t)a[6 * i + 1] & 0x0f) << 8;
-        tmp[1] = a[6 * i + 1] >> 4 | ((uint16_t)a[6 * i + 2] & 0xff) << 4;
-        tmp[2] = a[6 * i + 3]      | ((uint16_t)a[6 * i + 4] & 0x0f) << 8;
-        tmp[3] = a[6 * i + 4] >> 4 | ((uint16_t)a[6 * i + 5] & 0xff) << 4;
-
-        doublebasemul_asm(&r->coeffs[4*i], &r->coeffs[4*i], tmp, zetas[i]);
-    }
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta1
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA1
-*
-* Arguments:   - poly *r:                   pointer to output polynomial
-*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
-*              - unsigned char nonce:       one-byte input nonce
-*              - int add:                   boolean to indicate to accumulate into r
-**************************************************/
-void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
-    unsigned char buf[KYBER_ETA1 * KYBER_N / 4];
-
-    prf(buf, KYBER_ETA1 * KYBER_N / 4, seed, nonce);
-    cbd_eta1(r, buf, add);
-}
-
-/*************************************************
-* Name:        poly_getnoise_eta2
-*
-* Description: Sample a polynomial deterministically from a seed and a nonce,
-*              with output polynomial close to centered binomial distribution
-*              with parameter KYBER_ETA2
-*
-* Arguments:   - poly *r:                   pointer to output polynomial
-*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
-*              - unsigned char nonce:       one-byte input nonce
-*              - int add:                   boolean to indicate to accumulate into r
-**************************************************/
-void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
-    unsigned char buf[KYBER_ETA2 * KYBER_N / 4];
-
-    prf(buf, KYBER_ETA2 * KYBER_N / 4, seed, nonce);
-    cbd_eta2(r, buf, add);
-}
-
-/*************************************************
-* Name:        poly_ntt
-*
-* Description: Computes negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place;
-*              inputs assumed to be in normal order, output in bitreversed order
-*
-* Arguments:   - uint16_t *r: pointer to in/output polynomial
-**************************************************/
-void poly_ntt(poly *r) {
-    ntt(r->coeffs);
-}
-
-/*************************************************
-* Name:        poly_invntt
-*
-* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
-*              a polynomial in place;
-*              inputs assumed to be in bitreversed order, output in normal order
-*
-* Arguments:   - uint16_t *a: pointer to in/output polynomial
-**************************************************/
-void poly_invntt(poly *r) {
-    invntt(r->coeffs);
-}
-
-extern void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int16_t *);
-/*************************************************
-* Name:        poly_basemul
-*
-* Description: Multiplication of two polynomials in NTT domain
-*
-* Arguments:   - poly *r:       pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul(poly *r, const poly *a, const poly *b) {
-    basemul_asm(r->coeffs, a->coeffs, b->coeffs, zetas);
-}
-
-extern void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int16_t *);
-/*************************************************
-* Name:        poly_basemul_acc
-*
-* Description: Multiplication of two polynomials in NTT domain, accumulating
-*
-* Arguments:   - poly *r:       pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_basemul_acc(poly *r, const poly *a, const poly *b) {
-    basemul_asm_acc(r->coeffs, a->coeffs, b->coeffs, zetas);
-}
-
-extern void asm_frommont(int16_t *r);
-/*************************************************
-* Name:        poly_frommont
-*
-* Description: Inplace conversion of all coefficients of a polynomial
-*              from Montgomery domain to normal domain
-*
-* Arguments:   - poly *r:       pointer to input/output polynomial
-**************************************************/
-void poly_frommont(poly *r) {
-  asm_frommont(r->coeffs);
-}
-
-extern void asm_barrett_reduce(int16_t *r);
-/*************************************************
-* Name:        poly_reduce
-*
-* Description: Applies Barrett reduction to all coefficients of a polynomial
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r:       pointer to input/output polynomial
-**************************************************/
-void poly_reduce(poly *r) {
-  asm_barrett_reduce(r->coeffs);
-}
-
-extern void pointwise_add(int16_t *, const int16_t *, const int16_t *);
-/*************************************************
-* Name:        poly_add
-*
-* Description: Add two polynomials
-*
-* Arguments: - poly *r:       pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_add(poly *r, const poly *a, const poly *b) {
-    pointwise_add(r->coeffs,a->coeffs,b->coeffs);
-}
-
-
-extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *);
-/*************************************************
-* Name:        poly_sub
-*
-* Description: Subtract two polynomials
-*
-* Arguments: - poly *r:       pointer to output polynomial
-*            - const poly *a: pointer to first input polynomial
-*            - const poly *b: pointer to second input polynomial
-**************************************************/
-void poly_sub(poly *r, const poly *a, const poly *b) {
-    pointwise_sub(r->coeffs,a->coeffs,b->coeffs);
-}
-
-/*************************************************
-* Name:        poly_frommsg
-*
-* Description: Convert 32-byte message to polynomial
-*
-* Arguments:   - poly *r:                  pointer to output polynomial
-*              - const unsigned char *msg: pointer to input message
-**************************************************/
-void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]) {
-    int i, j;
-    uint16_t mask;
-
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        for (j = 0; j < 8; j++) {
-            mask = -((msg[i] >> j) & 1);
-            r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2);
-        }
-    }
-}
-
-/*************************************************
-* Name:        poly_tomsg
-*
-* Description: Convert polynomial to 32-byte message
-*
-* Arguments:   - unsigned char *msg: pointer to output message
-*              - const poly *a:      pointer to input polynomial
-**************************************************/
-void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) {
-    uint16_t t;
-    int i, j;
-
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        msg[i] = 0;
-        for (j = 0; j < 8; j++) {
-            t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
-            msg[i] |= t << j;
-        }
-    }
-}
-
-/*************************************************
-* Name:        poly_zeroize
-*
-* Description: Zeros a polynomial
-*
-* Arguments:   - poly *p: pointer to polynomial
-**************************************************/
-void poly_zeroize(poly *p) {
-  int i;
-  for(i = 0; i < KYBER_N; i++)
-   p->coeffs[i] = 0;
-}
diff --git a/src/kyber512/poly.h b/src/kyber512/poly.h
deleted file mode 100644
index 6e3a32b..0000000
--- a/src/kyber512/poly.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef POLY_H
-#define POLY_H
-
-#include "params.h"
-
-#include <stdint.h>
-
-#define poly_getnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 0)
-#define poly_getnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 0)
-#define poly_addnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 1)
-#define poly_addnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 1)
-
-/*
- * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
- * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1]
- */
-typedef struct {
-    int16_t coeffs[KYBER_N];
-} poly;
-
-void poly_compress(unsigned char *r, poly *a);
-void poly_decompress(poly *r, const unsigned char *a);
-
-void poly_packcompress(unsigned char *r, poly *a, int i);
-void poly_unpackdecompress(poly *r, const unsigned char *a, int i);
-
-int cmp_poly_compress(const unsigned char *r, poly *a);
-int cmp_poly_packcompress(const unsigned char *r, poly *a, int i);
-
-void poly_tobytes(unsigned char *r, poly *a);
-void poly_frombytes(poly *r, const unsigned char *a);
-void poly_frombytes_mul(poly *r, const unsigned char *a);
-
-void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
-void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a);
-
-void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add);
-void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add);
-
-void poly_ntt(poly *r);
-void poly_invntt(poly *r);
-void poly_basemul(poly *r, const poly *a, const poly *b);
-void poly_basemul_acc(poly *r, const poly *a, const poly *b);
-void poly_frommont(poly *r);
-
-void poly_reduce(poly *r);
-
-void poly_add(poly *r, const poly *a, const poly *b);
-void poly_sub(poly *r, const poly *a, const poly *b);
-
-void poly_zeroize(poly *p);
-
-#endif
diff --git a/src/kyber512/polyvec.c b/src/kyber512/polyvec.c
deleted file mode 100644
index b1e387a..0000000
--- a/src/kyber512/polyvec.c
+++ /dev/null
@@ -1,196 +0,0 @@
-#include <stdint.h>
-#include "polyvec.h"
-#include "poly.h"
-
-/*************************************************
-* Name:        polyvec_compress
-*
-* Description: Compress and serialize vector of polynomials
-*
-* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_compress(unsigned char *r, polyvec *a)
-{
-  int i,j,k;
-
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  uint16_t t[8];
-  for(i=0;i<KYBER_K;i++)
-  {
-    for(j=0;j<KYBER_N/8;j++)
-    {
-      for(k=0;k<8;k++)
-        t[k] = ((((uint32_t)a->vec[i].coeffs[8*j+k] << 11) + KYBER_Q/2) / KYBER_Q) & 0x7ff;
-
-      r[11*j+ 0] =  t[0] & 0xff;
-      r[11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
-      r[11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
-      r[11*j+ 3] = (t[2] >>  2) & 0xff;
-      r[11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
-      r[11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
-      r[11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
-      r[11*j+ 7] = (t[5] >>  1) & 0xff;
-      r[11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
-      r[11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
-      r[11*j+10] = (t[7] >>  3);
-    }
-    r += 352;
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  uint16_t t[4];
-  for(i=0;i<KYBER_K;i++)
-  {
-    for(j=0;j<KYBER_N/4;j++)
-    {
-      for(k=0;k<4;k++)
-        t[k] = ((((uint32_t)a->vec[i].coeffs[4*j+k] << 10) + KYBER_Q/2) / KYBER_Q) & 0x3ff;
-
-      r[5*j+ 0] =  t[0] & 0xff;
-      r[5*j+ 1] = (t[0] >>  8) | ((t[1] & 0x3f) << 2);
-      r[5*j+ 2] = (t[1] >>  6) | ((t[2] & 0x0f) << 4);
-      r[5*j+ 3] = (t[2] >>  4) | ((t[3] & 0x03) << 6);
-      r[5*j+ 4] = (t[3] >>  2);
-    }
-    r += 320;
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_decompress
-*
-* Description: De-serialize and decompress vector of polynomials;
-*              approximate inverse of polyvec_compress
-*
-* Arguments:   - polyvec *r:       pointer to output vector of polynomials
-*              - unsigned char *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES)
-**************************************************/
-void polyvec_decompress(polyvec *r, const unsigned char *a)
-{
-  int i,j;
-#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
-  for(i=0;i<KYBER_K;i++)
-  {
-    for(j=0;j<KYBER_N/8;j++)
-    {
-      r->vec[i].coeffs[8*j+0] =  (((a[11*j+ 0]       | (((uint32_t)a[11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+1] = ((((a[11*j+ 1] >> 3) | (((uint32_t)a[11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+2] = ((((a[11*j+ 2] >> 6) | (((uint32_t)a[11*j+ 3] & 0xff) << 2) | (((uint32_t)a[11*j+ 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+3] = ((((a[11*j+ 4] >> 1) | (((uint32_t)a[11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+4] = ((((a[11*j+ 5] >> 4) | (((uint32_t)a[11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+5] = ((((a[11*j+ 6] >> 7) | (((uint32_t)a[11*j+ 7] & 0xff) << 1) | (((uint32_t)a[11*j+ 8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+6] = ((((a[11*j+ 8] >> 2) | (((uint32_t)a[11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
-      r->vec[i].coeffs[8*j+7] = ((((a[11*j+ 9] >> 5) | (((uint32_t)a[11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
-    }
-    a += 352;
-  }
-#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
-  for(i=0;i<KYBER_K;i++)
-  {
-    for(j=0;j<KYBER_N/4;j++)
-    {
-      r->vec[i].coeffs[4*j+0] =  (((a[5*j+ 0]       | (((uint32_t)a[5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
-      r->vec[i].coeffs[4*j+1] = ((((a[5*j+ 1] >> 2) | (((uint32_t)a[5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
-      r->vec[i].coeffs[4*j+2] = ((((a[5*j+ 2] >> 4) | (((uint32_t)a[5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
-      r->vec[i].coeffs[4*j+3] = ((((a[5*j+ 3] >> 6) | (((uint32_t)a[5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
-    }
-    a += 320;
-  }
-#else
-#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
-#endif
-}
-
-/*************************************************
-* Name:        polyvec_tobytes
-*
-* Description: Serialize vector of polynomials
-*
-* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES)
-*              - const polyvec *a: pointer to input vector of polynomials
-**************************************************/
-void polyvec_tobytes(unsigned char *r, polyvec *a)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_tobytes(r+i*KYBER_POLYBYTES, &a->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_frombytes
-*
-* Description: De-serialize vector of polynomials;
-*              inverse of polyvec_tobytes
-*
-* Arguments:   - unsigned char *r: pointer to output byte array
-*              - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES)
-**************************************************/
-void polyvec_frombytes(polyvec *r, const unsigned char *a)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_frombytes(&r->vec[i], a+i*KYBER_POLYBYTES);
-}
-
-/*************************************************
-* Name:        polyvec_ntt
-*
-* Description: Apply forward NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_ntt(polyvec *r)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_ntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_invntt
-*
-* Description: Apply inverse NTT to all elements of a vector of polynomials
-*
-* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
-**************************************************/
-void polyvec_invntt(polyvec *r)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_invntt(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_reduce
-*
-* Description: Applies Barrett reduction to each coefficient
-*              of each element of a vector of polynomials
-*              for details of the Barrett reduction see comments in reduce.c
-*
-* Arguments:   - poly *r:       pointer to input/output polynomial
-**************************************************/
-void polyvec_reduce(polyvec *r)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_reduce(&r->vec[i]);
-}
-
-/*************************************************
-* Name:        polyvec_add
-*
-* Description: Add vectors of polynomials
-*
-* Arguments: - polyvec *r:       pointer to output vector of polynomials
-*            - const polyvec *a: pointer to first input vector of polynomials
-*            - const polyvec *b: pointer to second input vector of polynomials
-**************************************************/
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
-{
-  int i;
-  for(i=0;i<KYBER_K;i++)
-    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
-}
diff --git a/src/kyber512/polyvec.h b/src/kyber512/polyvec.h
deleted file mode 100644
index 2271305..0000000
--- a/src/kyber512/polyvec.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef POLYVEC_H
-#define POLYVEC_H
-
-#include "params.h"
-#include "poly.h"
-
-typedef struct {
-    poly vec[KYBER_K];
-} polyvec;
-
-void polyvec_compress(unsigned char *r, polyvec *a);
-void polyvec_decompress(polyvec *r, const unsigned char *a);
-
-void polyvec_tobytes(unsigned char *r, polyvec *a);
-void polyvec_frombytes(polyvec *r, const unsigned char *a);
-
-void polyvec_ntt(polyvec *r);
-void polyvec_invntt(polyvec *r);
-
-void polyvec_reduce(polyvec *r);
-
-void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
-
-#endif
diff --git a/src/kyber512/randombytes.h b/src/kyber512/randombytes.h
deleted file mode 100644
index 2c1f511..0000000
--- a/src/kyber512/randombytes.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#pragma once
-#include "../rng.h"
diff --git a/src/kyber512/reduce.S b/src/kyber512/reduce.S
deleted file mode 100644
index e98ed33..0000000
--- a/src/kyber512/reduce.S
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "macros.i"
-
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-.global asm_barrett_reduce
-.type asm_barrett_reduce,%function
-.align 2
-asm_barrett_reduce:
-  push    {r4-r11, r14}
-
-  poly        .req r0
-  poly0       .req r1
-  poly1       .req r2
-  poly2       .req r3
-  poly3       .req r4
-  poly4       .req r5
-  poly5       .req r6
-  poly6       .req r7
-  poly7       .req r8
-  loop        .req r9
-  barrettconst .req r10
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw barrettconst, #20159
-  movw q, #3329
-
-  movw loop, #16
-  1:
-    ldm poly, {poly0-poly7}
-
-    doublebarrett poly0, tmp, tmp2, q, barrettconst
-    doublebarrett poly1, tmp, tmp2, q, barrettconst
-    doublebarrett poly2, tmp, tmp2, q, barrettconst
-    doublebarrett poly3, tmp, tmp2, q, barrettconst
-    doublebarrett poly4, tmp, tmp2, q, barrettconst
-    doublebarrett poly5, tmp, tmp2, q, barrettconst
-    doublebarrett poly6, tmp, tmp2, q, barrettconst
-    doublebarrett poly7, tmp, tmp2, q, barrettconst
-
-    stm poly!, {poly0-poly7}
-
-    subs.w loop, #1
-  bne.w 1b
-
-  pop     {r4-r11, pc}
-
-
-.macro from_mont q, qinv, a, c, tmp, tmp2
-  smulbb \tmp, \a, \c
-  montgomery \q, \qinv, \tmp, \tmp2
-  smultb \a, \a, \c
-  montgomery \q, \qinv, \a, \tmp
-  pkhtb \a, \tmp, \tmp2, asr#16
-.endm
-
-.global asm_frommont
-.type asm_frommont,%function
-.align 2
-asm_frommont:
-  push    {r4-r11, r14}
-
-  poly        .req r0
-  poly0       .req r1
-  poly1       .req r2
-  poly2       .req r3
-  poly3       .req r4
-  poly4       .req r5
-  poly5       .req r6
-  poly6       .req r7
-  poly7       .req r8
-  loop        .req r9
-  constant    .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #3329
-  movt qinv, #3327
-
-  movw constant, #1353
-
-  movw loop, #16
-  1:
-    ldm poly, {poly0-poly7}
-
-    from_mont q, qinv, poly0, constant, tmp, tmp2
-    from_mont q, qinv, poly1, constant, tmp, tmp2
-    from_mont q, qinv, poly2, constant, tmp, tmp2
-    from_mont q, qinv, poly3, constant, tmp, tmp2
-    from_mont q, qinv, poly4, constant, tmp, tmp2
-    from_mont q, qinv, poly5, constant, tmp, tmp2
-    from_mont q, qinv, poly6, constant, tmp, tmp2
-    from_mont q, qinv, poly7, constant, tmp, tmp2
-
-    stm poly!, {poly0-poly7}
-
-    subs.w loop, #1
-  bne.w 1b
-
-  pop     {r4-r11, pc}
diff --git a/src/kyber512/symmetric-fips202.c b/src/kyber512/symmetric-fips202.c
deleted file mode 100644
index 311d33f..0000000
--- a/src/kyber512/symmetric-fips202.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "fips202.h"
-#include "symmetric.h"
-
-#include <stdlib.h>
-
-/*************************************************
-* Name:        kyber_shake128_absorb
-*
-* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
-*
-* Arguments:   - shake128ctx *s:                  pointer to (uninitialized) output Keccak state
-*              - const unsigned char *input:      pointer to KYBER_SYMBYTES input to be absorbed into s
-*              - unsigned char i                  additional byte of input
-*              - unsigned char j                  additional byte of input
-**************************************************/
-void kyber_shake128_absorb(shake128ctx *s, const unsigned char *input, unsigned char x, unsigned char y) {
-    unsigned char extseed[KYBER_SYMBYTES + 2];
-    int i;
-
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        extseed[i] = input[i];
-    }
-    extseed[i++] = x;
-    extseed[i]   = y;
-    shake128_absorb(s, extseed, KYBER_SYMBYTES + 2);
-}
-
-/*************************************************
-* Name:        kyber_shake128_squeezeblocks
-*
-* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each.
-*              Modifies the state. Can be called multiple times to keep squeezing,
-*              i.e., is incremental.
-*
-* Arguments:   - unsigned char *output:      pointer to output blocks
-*              - size_t nblocks:             number of blocks to be squeezed (written to output)
-*              - shake128ctx *s:            pointer to in/output Keccak state
-**************************************************/
-void kyber_shake128_squeezeblocks(unsigned char *output, size_t nblocks, shake128ctx *s) {
-    shake128_squeezeblocks(output, nblocks, s);
-}
-
-/*************************************************
-* Name:        shake256_prf
-*
-* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
-*              and then generates outlen bytes of SHAKE256 output
-*
-* Arguments:   - unsigned char *output:      pointer to output
-*              - size_t outlen:              number of requested output bytes
-*              - const unsigned char * key:  pointer to the key (of length KYBER_SYMBYTES)
-*              - const unsigned char nonce:  single-byte nonce (public PRF input)
-**************************************************/
-void shake256_prf(unsigned char *output, size_t outlen, const unsigned char *key, unsigned char nonce) {
-    unsigned char extkey[KYBER_SYMBYTES + 1];
-    size_t i;
-
-    for (i = 0; i < KYBER_SYMBYTES; i++) {
-        extkey[i] = key[i];
-    }
-    extkey[i] = nonce;
-
-    shake256(output, outlen, extkey, KYBER_SYMBYTES + 1);
-}
diff --git a/src/kyber512/symmetric.h b/src/kyber512/symmetric.h
deleted file mode 100644
index d396466..0000000
--- a/src/kyber512/symmetric.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef SYMMETRIC_H
-#define SYMMETRIC_H
-
-#include "fips202.h"
-#include "params.h"
-#include <stddef.h>
-
-void kyber_shake128_absorb(shake128ctx *s, const unsigned char *input, unsigned char x, unsigned char y);
-void kyber_shake128_squeezeblocks(unsigned char *output, size_t nblocks, shake128ctx *s);
-void shake256_prf(unsigned char *output, size_t outlen, const unsigned char *key, unsigned char nonce);
-
-#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
-#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
-#define xof_absorb(STATE, IN, X, Y) kyber_shake128_absorb(STATE, IN, X, Y)
-#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) kyber_shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
-#define prf(OUT, OUTBYTES, KEY, NONCE) shake256_prf(OUT, OUTBYTES, KEY, NONCE)
-#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES)
-
-#define XOF_BLOCKBYTES 168
-
-typedef shake128ctx xof_state;
-
-#endif /* SYMMETRIC_H */
diff --git a/src/kyber512/verify.c b/src/kyber512/verify.c
deleted file mode 100644
index 679ec89..0000000
--- a/src/kyber512/verify.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "verify.h"
-
-#include <stdint.h>
-#include <stdlib.h>
-
-/*************************************************
-* Name:        verify
-*
-* Description: Compare two arrays for equality in constant time.
-*
-* Arguments:   const unsigned char *a: pointer to first byte array
-*              const unsigned char *b: pointer to second byte array
-*              size_t len:             length of the byte arrays
-*
-* Returns 0 if the byte arrays are equal, 1 otherwise
-**************************************************/
-unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len) {
-    uint64_t r;
-    size_t i;
-
-    r = 0;
-    for (i = 0; i < len; i++) {
-        r |= a[i] ^ b[i];
-    }
-
-    r = (~r + 1); // Two's complement
-    r >>= 63;
-    return (unsigned char)r;
-}
-
-/*************************************************
-* Name:        cmov
-*
-* Description: Copy len bytes from x to r if b is 1;
-*              don't modify x if b is 0. Requires b to be in {0,1};
-*              assumes two's complement representation of negative integers.
-*              Runs in constant time.
-*
-* Arguments:   unsigned char *r:       pointer to output byte array
-*              const unsigned char *x: pointer to input byte array
-*              size_t len:             Amount of bytes to be copied
-*              unsigned char b:        Condition bit; has to be in {0,1}
-**************************************************/
-void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
-    size_t i;
-
-    b = -b;
-    for (i = 0; i < len; i++) {
-        r[i] ^= b & (x[i] ^ r[i]);
-    }
-}
diff --git a/src/kyber512/verify.h b/src/kyber512/verify.h
deleted file mode 100644
index 8777a14..0000000
--- a/src/kyber512/verify.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef VERIFY_H
-#define VERIFY_H
-
-#include <stdio.h>
-
-unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len);
-
-void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);
-
-#endif
diff --git a/src/kyber512/wrapper.c b/src/kyber512/wrapper.c
deleted file mode 100644
index ce54117..0000000
--- a/src/kyber512/wrapper.c
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "./wrapper.h"
-#include "./params.h"
-#include "./api.h"
-
-#if KYBER512_PUBLIC_KEY_SIZE != KYBER_PUBLICKEYBYTES
-#error invalid public key size, update me!
-#endif
-#if KYBER512_PRIVATE_KEY_SIZE != KYBER_SECRETKEYBYTES
-#error invalid private key size, update me!
-#endif
-#if KYBER512_SHARED_SECRET_SIZE != KYBER_SSBYTES
-#error invalid shared secret size, update me!
-#endif
-#if KYBER512_CIPHERTEXT_SIZE != KYBER_CIPHERTEXTBYTES
-#error invalid key encapsulation message size, update me!
-#endif
-
-uint8_t* Kyber512State_getPrivateKey(Kyber512State* self) {
-	return self->m_privateKey;
-}
-
-uint8_t* Kyber512State_getPublicKey(Kyber512State* self) {
-	return self->m_publicKey;
-}
-
-uint8_t* Kyber512State_getSharedSecretBuffer(Kyber512State* self) {
-	return self->m_sharedSecretBuffer;
-}
-
-uint8_t* Kyber512State_getKeyEncapsulationMessageBuffer(Kyber512State* self) {
-	return self->m_keyEncapsulationMessageBuffer;
-}
-
-int Kyber512State_generate(Kyber512State* self) {
-	return crypto_kem_enc(
-		self->m_keyEncapsulationMessageBuffer,
-		self->m_sharedSecretBuffer,
-		self->m_publicKey
-	);
-}
-
-int Kyber512State_decode(Kyber512State* self) {
-	return crypto_kem_dec(
-		self->m_sharedSecretBuffer,
-		self->m_keyEncapsulationMessageBuffer,
-		self->m_privateKey
-	);
-}
diff --git a/src/kyber512/wrapper.h b/src/kyber512/wrapper.h
deleted file mode 100644
index 93c754c..0000000
--- a/src/kyber512/wrapper.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef _KYBER512_WRAPPER_H_
-#define _KYBER512_WRAPPER_H_
-
-#include <stdint.h>
-#include <stddef.h>
-
-#define KYBER512_PUBLIC_KEY_SIZE 800
-#define KYBER512_PRIVATE_KEY_SIZE 1632
-#define KYBER512_SHARED_SECRET_SIZE 32
-#define KYBER512_CIPHERTEXT_SIZE 768
-
-/**
- * Simple object-oriented wrapper around the various Dilithium functions.
- * All "methods" start with the prefix "Kyber512State_".
- */
-typedef struct Kyber512State_t {
-	uint8_t m_publicKey[KYBER512_PUBLIC_KEY_SIZE];
-	uint8_t m_privateKey[KYBER512_PRIVATE_KEY_SIZE];
-	uint8_t m_sharedSecretBuffer[KYBER512_SHARED_SECRET_SIZE];
-	uint8_t m_keyEncapsulationMessageBuffer[KYBER512_CIPHERTEXT_SIZE];
-} Kyber512State;
-
-/**
- * Get or set the private key bytes. The buffer returned by this method has
- * size KYBER512_PRIVATE_KEY_SIZE.
- */
-uint8_t* Kyber512State_getPrivateKey(Kyber512State* self);
-
-/**
- * Get or set the public key bytes. The buffer returned by this method has size
- * KYBER512_PUBLIC_KEY_SIZE.
- */
-uint8_t* Kyber512State_getPublicKey(Kyber512State* self);
-
-/**
- * Get or set the shared secret. The buffer returned by this method has size
- * KYBER512_SHARED_SECRET_SIZE.
- */
-uint8_t* Kyber512State_getSharedSecretBuffer(Kyber512State* self);
-
-/**
- * Get or set the key encapsulation message. The buffer returned by this method
- * has size KYBER512_CIPHERTEXT_SIZE.
- */
-uint8_t* Kyber512State_getKeyEncapsulationMessageBuffer(Kyber512State* self);
-
-/**
- * Generate a shared secret, as well as an accompanying key encapsulation
- * message (the ciphertext) that is to be sent over a hypothetical public
- * channel.
- * 
- * The generated shared secret can be retrieved via the
- * Kyber512State_getSharedSecretBuffer method.
- * 
- * The key encapsulation message can be retrieved via the
- * Kyber512State_getKeyEncapsulationMessageBuffer method.
- */
-int Kyber512State_generate(Kyber512State* self);
-
-/**
- * Decode the key encapsulation message into a shared secret, using the
- * key encapsulation message that was written to the buffer pointed to by
- * Kyber512State_getKeyEncapsulationMessageBuffer.
- * 
- * The shared secret can be retrieved via the
- * Kyber512State_getSharedSecretBuffer method.
- */
-int Kyber512State_decode(Kyber512State* self);
-
-#endif // _KYBER512_WRAPPER_H_
diff --git a/src/main.c b/src/main.c
index 86a4770..0675cdb 100755
--- a/src/main.c
+++ b/src/main.c
@@ -1,12 +1,21 @@
-//Main file for Riscure Pinata Board rev3.0
-//Riscure 2014, 2015, 2016, 2017, 2018, 2019
+//Main file for Riscure Pinata Board rev4.0
+//Riscure 2014, 2015, 2016, 2017, 2018, 2019, 2026
 //
-//Code revision: 	3.2 -- 20190808v1
+//Code revision: 	4.0 -- 2026/03/24
 //
 //IMPORTANT:
 //Presence of hardware crypto engine is defined via the Makefile
 //Whether to include PQC algorithms (and exclude classic ciphers) is also decided via the Makefile
 //
+//Changelog from code revision 4.0 from 3.2
+//
+// - BREAKING CHANGE: Update cipher: Kyber     NIST Round 3 -> ML-KEM FIPS-203
+// - BREAKING CHANGE: Update cipher: Dilithium NIST Round 3 -> ML-DSA FIPS-204
+// 
+// - Notes: ML-DSA private key size has CHANGED! From 4016 to 4032 bytes.
+//          ML-DSA signature size has CHANGED! From 3293 to 3309 bytes.
+//          Please update your acquisition scripts!
+//
 //Changelog from code revision 3.2 from 3.1
 //
 // Added Kyber512 key encapsulation cryptosystem.
@@ -60,8 +69,9 @@
 #include "io.h"
 
 #ifdef VARIANT_PQC
-#include "dilithium/wrapper.h"
-#include "kyber512/wrapper.h"
+#include "mldsa/wrapper.h"
+#include "mlkem/wrapper.h"
+#include "pqm4_hal/pinata_callbacks.h"
 #endif
 
 //Local functions
@@ -97,7 +107,7 @@ uint8_t rxBuffer[RXBUFFERLENGTH] = {};
 const uint8_t zeros[20]={'0','0','0','0','0','0','0','0','0','0','0','0','0','0','0','0','0','0','0','0'};
 const uint8_t glitched[] = { 0xFA, 0xCC };
 const uint8_t cmdByteIsWrong[] = { 'B','a','d','C','m','d','\n',0x00};
-const uint8_t codeVersion[] = { 'V','e','r',' ','3','.','2',0x00};
+const uint8_t codeVersion[] = { 'V','e','r',' ','4','.','0',0x00};
 
 volatile uint8_t usbSerialEnabled=0;
 volatile int busyWait1;
@@ -113,8 +123,14 @@ unsigned char etxBuf[256] ={};
 #define END_INTERESTING_STUFF GPIOC->BSRRH = GPIO_Pin_2
 
 #ifdef VARIANT_PQC
-DilithiumState dilithium;
-Kyber512State kyber512;
+MlDsaState g_mldsa;
+MlKemState g_mlkem;
+void handle_mldsa_sign_start() { // registered in main() via PINATA_PATCH_mldsa_set_sign_start_callback
+	GPIOC->BSRRL = GPIO_Pin_1; // counterpart of the BSRRH write in the finish handler; presumably drives GPIO Pin 1 high — confirm
+}
+void handle_mldsa_sign_finish() { // registered in main() via PINATA_PATCH_mldsa_set_sign_finish_callback
+	GPIOC->BSRRH = GPIO_Pin_1; // same register END_INTERESTING_STUFF writes for Pin 2; presumably drives GPIO Pin 1 low — confirm
+}
 #endif
 
 ////////////////////////////////////////////////////
@@ -124,8 +140,10 @@ int main(void) {
 	uint8_t cmd;
 	uint8_t tmp;
 
-#ifndef VARIANT_PQC
-
+#ifdef VARIANT_PQC
+	PINATA_PATCH_mldsa_set_sign_start_callback(&handle_mldsa_sign_start);
+	PINATA_PATCH_mldsa_set_sign_finish_callback(&handle_mldsa_sign_finish);
+#else
 	int payload_len, i, glitchedBoot, authenticated, counter=0;
 	ErrorStatus cryptoCompletedOK=ERROR;
 	//We will need ROUNDS + 1 keys to be generated by the key schedule (multiplied by 4 because we can only store 32 bits at a time).
@@ -238,29 +256,29 @@ int main(void) {
 
 #ifdef VARIANT_PQC
 
-			case CMD_SW_DILITHIUM_GET_VARIANT:
+			case CMD_SW_MLDSA_GET_VARIANT:
 				// Return the response.
-				send_char(getDilithiumAlgorithmVariant());
+				send_char(getMlDsaAlgorithmVariant());
 				break;
 
-			case CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY: {
+			case CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY: {
 				// Receive the input parameters and handle the request.
-				get_bytes(DILITHIUM_PUBLIC_KEY_SIZE, DilithiumState_getPublicKey(&dilithium));
-				get_bytes(DILITHIUM_PRIVATE_KEY_SIZE, DilithiumState_getPrivateKey(&dilithium));
+				get_bytes(MLDSA_PUBLIC_KEY_SIZE, MlDsaState_getPublicKey(&g_mldsa));
+				get_bytes(MLDSA_PRIVATE_KEY_SIZE, MlDsaState_getPrivateKey(&g_mldsa));
 
 				// Return the response.
 				send_char(0);
 				break;
 			}
 
-			case CMD_SW_DILITHIUM_VERIFY: {
+			case CMD_SW_MLDSA_VERIFY: {
 				// Receive the input parameters.
-				uint8_t* signedMessageBuffer = DilithiumState_getScratchPad(&dilithium);
-				get_bytes(DILITHIUM_SIGNED_MESSAGE_SIZE, signedMessageBuffer);
+				uint8_t* signedMessageBuffer = MlDsaState_getScratchPad(&g_mldsa);
+				get_bytes(MLDSA_SIGNED_MESSAGE_SIZE, signedMessageBuffer);
 
 				// Handle the request.
 				BEGIN_INTERESTING_STUFF;
-				int result = DilithiumState_verify(&dilithium, signedMessageBuffer);
+				int result = MlDsaState_verify(&g_mldsa, signedMessageBuffer);
 				END_INTERESTING_STUFF;
 
 				// Return the response.
@@ -268,20 +286,26 @@ int main(void) {
 				break;	
 			}
 
-			case CMD_SW_DILITHIUM_SIGN: {
+			case CMD_SW_MLDSA_SIGN: {
 				// Receive the input parameters.
-				uint8_t* signedMessageBuffer = DilithiumState_getScratchPad(&dilithium);
-				get_bytes(DILITHIUM_MESSAGE_SIZE, signedMessageBuffer + DILITHIUM_SIGNATURE_SIZE);
+				uint8_t* signedMessageBuffer = MlDsaState_getScratchPad(&g_mldsa);
+				get_bytes(MLDSA_MESSAGE_SIZE, signedMessageBuffer + MLDSA_SIGNATURE_SIZE);
 
 				// Handle the request.
+				// Note: on GPIO Pin 1, the "c*s1 multiplication in the NTT" is signaled.
+				// Use GPIO Pin 1 as trigger if you need only that part, and trigger on a falling edge.
+				// Triggering on a falling edge is necessary due to ML-DSA being a variable-length algorithm
+				// (due to so-called "rejection-sampling", which makes it start over).
+				// The {BEGIN,END}_INTERESTING_STUFF macros set GPIO Pin 2 to high/low, which can be used
+				// as a debugging tool.
 				BEGIN_INTERESTING_STUFF;
-				int result = DilithiumState_sign(&dilithium, signedMessageBuffer, signedMessageBuffer + DILITHIUM_SIGNATURE_SIZE);
+				int result = MlDsaState_sign(&g_mldsa, signedMessageBuffer, signedMessageBuffer + MLDSA_SIGNATURE_SIZE);
 				END_INTERESTING_STUFF;
 
 				if (result == 0) {
 					// OK: The message is now signed, let's send the signature of the message back.
 					send_char(0);
-					send_bytes(DILITHIUM_SIGNATURE_SIZE, signedMessageBuffer);
+					send_bytes(MLDSA_SIGNATURE_SIZE, signedMessageBuffer);
 				} else {
 					// ERROR: Signing the message failed.
 					send_char(1);
@@ -289,45 +313,45 @@ int main(void) {
 				break;
 			}
 
-			case CMD_SW_DILITHIUM_GET_KEY_SIZES: {
-				const uint16_t publicKeySize = DILITHIUM_PUBLIC_KEY_SIZE;
-				const uint16_t privateKeySize = DILITHIUM_PRIVATE_KEY_SIZE;
+			case CMD_SW_MLDSA_GET_KEY_SIZES: {
+				const uint16_t publicKeySize = MLDSA_PUBLIC_KEY_SIZE;
+				const uint16_t privateKeySize = MLDSA_PRIVATE_KEY_SIZE;
 				// Send the response; MUST be in little-endian order!
 				send_bytes(sizeof(publicKeySize), (const uint8_t*)&publicKeySize);
 				send_bytes(sizeof(privateKeySize), (const uint8_t*)&privateKeySize);
 				break;
 			}
 
-			case CMD_SW_DILITHIUM_NTT: {
-				int32_t polynomialBuffer[DILITHIUM_N];
+			case CMD_SW_MLDSA_NTT: {
+				int32_t polynomialBuffer[MLDSA_N];
 				// Receive the polynomial coefficients.
-				get_bytes(sizeof(int32_t)*DILITHIUM_N, (uint8_t*)polynomialBuffer);
+				get_bytes(sizeof(int32_t)*MLDSA_N, (uint8_t*)polynomialBuffer);
 				BEGIN_INTERESTING_STUFF;
-				Dilithium_ntt(polynomialBuffer);
+				MlDsa_ntt(polynomialBuffer);
 				END_INTERESTING_STUFF;
 				// No reply is sent.
 				break;
 			}
 
-			case CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY: {
+			case CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY: {
 				// Receive the input parameters and handle the request.
-				get_bytes(KYBER512_PUBLIC_KEY_SIZE, Kyber512State_getPublicKey(&kyber512));
-				get_bytes(KYBER512_PRIVATE_KEY_SIZE, Kyber512State_getPrivateKey(&kyber512));
+				get_bytes(MLKEM_PUBLIC_KEY_SIZE, MlKemState_getPublicKey(&g_mlkem));
+				get_bytes(MLKEM_PRIVATE_KEY_SIZE, MlKemState_getPrivateKey(&g_mlkem));
 				// Return the response.
 				send_char(0);
 				break;
 			}
 
-			case CMD_SW_KYBER512_GENERATE: {
+			case CMD_SW_MLKEM_GENERATE: {
 				// Generate a shared secret and an accompanying key encapsulation message.
 				BEGIN_INTERESTING_STUFF;
-				int result = Kyber512State_generate(&kyber512);
+				int result = MlKemState_generate(&g_mlkem);
 				END_INTERESTING_STUFF;
 				if (result == 0) {
 					// OK: The shared secret is now generated and encapsulated, let's send that back.
 					send_char(0);
-					send_bytes(KYBER512_SHARED_SECRET_SIZE, Kyber512State_getSharedSecretBuffer(&kyber512));
-					send_bytes(KYBER512_CIPHERTEXT_SIZE, Kyber512State_getKeyEncapsulationMessageBuffer(&kyber512));
+					send_bytes(MLKEM_SHARED_SECRET_SIZE, MlKemState_getSharedSecretBuffer(&g_mlkem));
+					send_bytes(MLKEM_CIPHERTEXT_SIZE, MlKemState_getKeyEncapsulationMessageBuffer(&g_mlkem));
 				} else {
 					// ERROR: Generation failed.
 					send_char(1);
@@ -335,18 +359,18 @@ int main(void) {
 				break;
 			}
 
-			case CMD_SW_KYBER512_DEC: {
+			case CMD_SW_MLKEM_DEC: {
 				// Receive the key encapsulation message that we are supposed to decode.
-				get_bytes(KYBER512_CIPHERTEXT_SIZE, Kyber512State_getKeyEncapsulationMessageBuffer(&kyber512));
-				memset(Kyber512State_getSharedSecretBuffer(&kyber512), 0, KYBER512_SHARED_SECRET_SIZE);
+				get_bytes(MLKEM_CIPHERTEXT_SIZE, MlKemState_getKeyEncapsulationMessageBuffer(&g_mlkem));
+				memset(MlKemState_getSharedSecretBuffer(&g_mlkem), 0, MLKEM_SHARED_SECRET_SIZE);
 				// Decode the key encapsulation message into a shared secret.
 				BEGIN_INTERESTING_STUFF;
-				int result = Kyber512State_decode(&kyber512);
+				int result = MlKemState_decode(&g_mlkem);
 				END_INTERESTING_STUFF;
 				if (result == 0) {
 					// OK: The shared secret is decoded, let's send that back.
 					send_char(0);
-					send_bytes(KYBER512_SHARED_SECRET_SIZE, Kyber512State_getSharedSecretBuffer(&kyber512));
+					send_bytes(MLKEM_SHARED_SECRET_SIZE, MlKemState_getSharedSecretBuffer(&g_mlkem));
 				} else {
 					// ERROR: Decoding the shared secret failed.
 					send_char(1);
@@ -354,9 +378,9 @@ int main(void) {
 				break;
 			}
 
-			case CMD_SW_KYBER512_GET_KEY_SIZES: {
-				const uint16_t publicKeySize = KYBER512_PUBLIC_KEY_SIZE;
-				const uint16_t privateKeySize = KYBER512_PRIVATE_KEY_SIZE;
+			case CMD_SW_MLKEM_GET_KEY_SIZES: {
+				const uint16_t publicKeySize = MLKEM_PUBLIC_KEY_SIZE;
+				const uint16_t privateKeySize = MLKEM_PRIVATE_KEY_SIZE;
 				// Send the response; MUST be in little-endian order!
 				send_bytes(sizeof(publicKeySize), (const uint8_t*)&publicKeySize);
 				send_bytes(sizeof(privateKeySize), (const uint8_t*)&privateKeySize);
diff --git a/src/main.h b/src/main.h
index baa1c1c..d86b705 100755
--- a/src/main.h
+++ b/src/main.h
@@ -39,7 +39,7 @@
 #include "sm4/sm4.h"
 #include "tea/tea.h"
 #include "present/present.h"
-#include "dilithium/wrapper.h"
+#include "mldsa/wrapper.h"
 #endif
 
 //ANSSI AES - see https://github.com/ANSSI-FR/SecAESSTM32
@@ -82,17 +82,17 @@
 
 //Pinata board crypto command bytes definition
 
-/// Set the public and private key for the Kyber512 crypto-system.
+/// Set the public and private key for the ML-KEM crypto-system.
 /// The public and private key MUST be valid. No validation is
 /// done by the Pinata.
 ///
 /// Expected Input:
-///   public key bytes of size KYBER512_PUBLIC_KEY_SIZE, followed by
-///   private key bytes of size KYBER512_PRIVATE_KEY_SIZE.
+///   public key bytes of size MLKEM_PUBLIC_KEY_SIZE, followed by
+///   private key bytes of size MLKEM_PRIVATE_KEY_SIZE.
 ///
 /// Output:
 ///   One byte; the byte is always zero.
-#define CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY 0x02
+#define CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY 0x02
 
 /// Get the public and private key sizes.
 ///
@@ -102,7 +102,7 @@
 /// Output:
 ///   16-bit unsigned integer in little endian order that contains the public key size, followed by
 ///   16-bit unsigned integer in little endian order that contains the private key size
-#define CMD_SW_KYBER512_GET_KEY_SIZES 0x03
+#define CMD_SW_MLKEM_GET_KEY_SIZES 0x03
 
 /// Generate a shared secret, as well as an accompanying key encapsulation
 /// message (the ciphertext) that is to be sent over a hypothetical public
@@ -110,30 +110,30 @@
 ///
 /// The shared secret is and key encapsulation mesage are generated using the
 /// public key that was set via the command
-/// CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY.
+/// CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY.
 ///
 /// Expected Input:
 ///   None
 ///
 /// Output:
 ///   If generation succeeded, returns a single byte with value 0, followed by
-///   shared secret bytes of size KYBER512_SHARED_SECRET_SIZE, followed by
-///   key encapsulation message (the ciphertext) of size KYBER512_CIPHERTEXT_SIZE
+///   shared secret bytes of size MLKEM_SHARED_SECRET_SIZE, followed by
+///   key encapsulation message (the ciphertext) of size MLKEM_CIPHERTEXT_SIZE
 ///
 ///   If generation failed, returns a single byte with value 1.
-#define CMD_SW_KYBER512_GENERATE 0x04
+#define CMD_SW_MLKEM_GENERATE 0x04
 
 /// Decrypt a key encapsulation message into a shared secret.
 ///
 /// The shared secret is decrypted using the private key that was set via
-/// CMD_SW_KYBER512_SET_PUBLIC_AND_PRIVATE_KEY.
+/// CMD_SW_MLKEM_SET_PUBLIC_AND_PRIVATE_KEY.
 ///
 /// Expected Input:
-///   key encapsulation message (the ciphertext) of size KYBER512_CIPHERTEXT_SIZE
+///   key encapsulation message (the ciphertext) of size MLKEM_CIPHERTEXT_SIZE
 ///
 /// Output:
-///   shared secret bytes of size KYBER512_SHARED_SECRET_SIZE
-#define CMD_SW_KYBER512_DEC 0x05
+///   One byte 0 followed by MLKEM_SHARED_SECRET_SIZE shared secret bytes on success; one byte 1 on failure.
+#define CMD_SW_MLKEM_DEC 0x05
 
 #define CMD_SWDES_ENC 0x44
 #define CMD_SWDES_DEC 0x45
@@ -160,51 +160,51 @@
 #define CMD_SWXTEA_ENC 0x6E
 #define CMD_SWXTEA_DEC 0x6F
 
-/// Return the Dilithium algorithm variant used in this implementation.
+/// Return the ML-DSA algorithm variant used in this implementation.
 /// The variant is one of the identifiers 1, 2, 3 or 4.
 ///
 /// Expected Input:
 ///   None
 ///
 /// Output:
-///   A single byte whose value is the Dilithium variant.
-#define CMD_SW_DILITHIUM_GET_VARIANT 0x90
+///   A single byte whose value is the ML-DSA variant.
+#define CMD_SW_MLDSA_GET_VARIANT 0x90
 
-/// Set the public and private key for the Dilithium crypto-system.
+/// Set the public and private key for the ML-DSA crypto-system.
 /// The public and private key MUST be valid. No validation is
 /// done by the Pinata.
 ///
 /// Expected Input:
-///   public key bytes of size DILITHIUM_PUBLIC_KEY_SIZE, followed by
-///   private key bytes of size DILITHIUM_PRIVATE_KEY_SIZE.
+///   public key bytes of size MLDSA_PUBLIC_KEY_SIZE, followed by
+///   private key bytes of size MLDSA_PRIVATE_KEY_SIZE.
 ///
 /// Output:
 ///   One byte; the byte is always zero.
-#define CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY 0x91
+#define CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY 0x91
 
 /// Verify a signed message, using the public key provided via
-/// CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY.
+/// CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY.
 ///
 /// Expected Input:
-///   Signature of length DILITHIUM_SIGNATURE_SIZE, followed by
-///   Message of length PINATA_DILITHIUM_MESSAGE_LENGTH
+///   Signature of length MLDSA_SIGNATURE_SIZE, followed by
+///   Message of length MLDSA_MESSAGE_SIZE
 ///
-///   (in other words, a "signed message" of size PINATA_DILITHIUM_SIGNED_MESSAGE_SIZE).
+///   (in other words, a "signed message" of size MLDSA_SIGNED_MESSAGE_SIZE).
 ///
 /// Output:
 ///   One byte; the byte is 0 if the signature of the message is valid,
 ///   non-zero otherwise.
-#define CMD_SW_DILITHIUM_VERIFY 0x92
+#define CMD_SW_MLDSA_VERIFY 0x92
 
 /// Sign a message, using the private key provided via
-/// CMD_SW_DILITHIUM_SET_PUBLIC_AND_PRIVATE_KEY.
+/// CMD_SW_MLDSA_SET_PUBLIC_AND_PRIVATE_KEY.
 ///
 /// Expected Input:
-///   message of length PINATA_DILITHIUM_MESSAGE_LENGTH bytes.
+///   message of length MLDSA_MESSAGE_SIZE bytes.
 ///
 /// Output:
-///   Signature of the message. The signature has size DILITHIUM_SIGNATURE_SIZE.
-#define CMD_SW_DILITHIUM_SIGN 0x93
+///   One byte 0 followed by the signature (MLDSA_SIGNATURE_SIZE bytes) on success; one byte 1 on failure.
+#define CMD_SW_MLDSA_SIGN 0x93
 
 /// Get the public and private key sizes.
 ///
@@ -214,16 +214,16 @@
 /// Output:
 ///   16-bit unsigned integer in little endian order that contains the public key size, followed by
 ///   16-bit unsigned integer in little endian order that contains the private key size
-#define CMD_SW_DILITHIUM_GET_KEY_SIZES 0x94
+#define CMD_SW_MLDSA_GET_KEY_SIZES 0x94
 
-/// Perform Dilithium NTT.
+/// Perform ML-DSA NTT.
 ///
 /// Expected Input:
-///   A total of DILITHIUM_N 32-bit integers in little endian order.
+///   A total of MLDSA_N 32-bit integers in little endian order.
 ///
 /// Output:
 ///   No reply is sent back.
-#define CMD_SW_DILITHIUM_NTT 0x9A
+#define CMD_SW_MLDSA_NTT 0x9A
 
 #define CMD_SWDES_ENC_MISALIGNED 0x14
 #define CMD_SWAES128_ENC_MISALIGNED 0x1E
diff --git a/src/dilithium.inc b/src/mldsa.inc
similarity index 99%
rename from src/dilithium.inc
rename to src/mldsa.inc
index 3118eed..d44b384 100644
--- a/src/dilithium.inc
+++ b/src/mldsa.inc
@@ -1,4 +1,4 @@
-DilithiumState dilithium = {
+MlDsaState mldsa = { // MlDsaState is the state struct typedef declared in mldsa/wrapper.h
     .m_pk =
         {0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, 0xa9, 0x63, 0xb4, 0xf1,
          0xc4, 0xcb, 0x73, 0x8b, 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e,
diff --git a/src/mldsa/CMakeLists.txt b/src/mldsa/CMakeLists.txt
new file mode 100644
index 0000000..566ffd9
--- /dev/null
+++ b/src/mldsa/CMakeLists.txt
@@ -0,0 +1 @@
+target_licensed_sources(wrapper.c wrapper.h)
diff --git a/src/mldsa/wrapper.c b/src/mldsa/wrapper.c
new file mode 100755
index 0000000..f0b1481
--- /dev/null
+++ b/src/mldsa/wrapper.c
@@ -0,0 +1,54 @@
+#include "wrapper.h"
+
+// These includes MUST stay private to wrapper.c,
+// otherwise we pollute the global namespace with equally named,
+// but totally different files (api.h / params.h)
+#include <params.h> // include is located in pqm4 source tree
+#include <api.h>    // include is located in pqm4 source tree
+#include <sign.h>   // include is located in pqm4 source tree
+#include <poly.h>   // include is located in pqm4 source tree
+
+#if MLDSA_PUBLIC_KEY_SIZE != CRYPTO_PUBLICKEYBYTES
+#error invalid public key size, update me!
+#endif
+#if MLDSA_PRIVATE_KEY_SIZE != CRYPTO_SECRETKEYBYTES
+#error invalid private key size, update me!
+#endif
+#if MLDSA_SIGNATURE_SIZE != CRYPTO_BYTES
+#error invalid signature size, update me!
+#endif
+#if MLDSA_N != N
+#error invalid N, update me!
+#endif
+
+int getMlDsaAlgorithmVariant() {
+	return DILITHIUM_MODE; // NOTE(review): not defined in this file; presumably provided by the pqm4 <params.h> included above — confirm
+}
+
+uint8_t* MlDsaState_getPrivateKey(MlDsaState* self) {
+	return self->m_sk; // mutable pointer into the state; m_sk is declared with MLDSA_PRIVATE_KEY_SIZE bytes in wrapper.h
+}
+
+uint8_t* MlDsaState_getPublicKey(MlDsaState* self) {
+	return self->m_pk; // mutable pointer into the state; m_pk is declared with MLDSA_PUBLIC_KEY_SIZE bytes in wrapper.h
+}
+
+uint8_t* MlDsaState_getScratchPad(MlDsaState* self) {
+	return self->m_scratchpad; // mutable pointer into the state; m_scratchpad is declared with MLDSA_SIGNED_MESSAGE_SIZE bytes in wrapper.h
+}
+
+int MlDsaState_verify(const MlDsaState* self, uint8_t *signedMessage) {
+	size_t messageLength = MLDSA_MESSAGE_SIZE; // handed to crypto_sign_open by pointer; not read again in this function
+	return crypto_sign_open(signedMessage, &messageLength, signedMessage, MLDSA_SIGNED_MESSAGE_SIZE, self->m_pk); // signedMessage is passed as both the first and the third argument; the callee's result is returned unchanged
+}
+
+int MlDsaState_sign(const MlDsaState* self, uint8_t* signature, const uint8_t* message) {
+	size_t signatureSize = MLDSA_SIGNATURE_SIZE; // handed to crypto_sign_signature by pointer; not read again in this function
+	return crypto_sign_signature(signature, &signatureSize, message, MLDSA_MESSAGE_SIZE, self->m_sk); // message length is fixed at MLDSA_MESSAGE_SIZE; the callee's result is returned unchanged
+}
+
+int MlDsa_ntt(uint32_t* coefficients) { // NOTE(review): main.c passes an int32_t[MLDSA_N] buffer to this uint32_t* parameter — confirm the intended signedness
+	poly* coeffs = (poly*)coefficients; // NOTE(review): assumes poly is layout-compatible with MLDSA_N 32-bit coefficients — confirm against the pqm4 <poly.h>
+	poly_ntt(coeffs); // operates on the caller's buffer through the cast pointer
+	return 0; // always reports success; poly_ntt's outcome is not inspected
+}
diff --git a/src/mldsa/wrapper.h b/src/mldsa/wrapper.h
new file mode 100755
index 0000000..8a36aa3
--- /dev/null
+++ b/src/mldsa/wrapper.h
@@ -0,0 +1,93 @@
+#ifndef _MLDSA_WRAPPER_H_
+#define _MLDSA_WRAPPER_H_
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define MLDSA_PUBLIC_KEY_SIZE 1952
+#define MLDSA_PRIVATE_KEY_SIZE 4032
+#define MLDSA_SIGNATURE_SIZE 3309
+#define MLDSA_MESSAGE_SIZE 16
+#define MLDSA_N 256
+#define MLDSA_SIGNED_MESSAGE_SIZE (MLDSA_SIGNATURE_SIZE + MLDSA_MESSAGE_SIZE)
+
+/**
+ * @brief      Get the ML-DSA algorithm variant. There are a few variants and
+ *             only one of them is implemented.
+ *
+ * @return     The ML-DSA algorithm variant (wrapper.c returns DILITHIUM_MODE).
+ */
+int getMlDsaAlgorithmVariant(void);
+
+/**
+ * Simple object-oriented wrapper around the various MlDsa functions.
+ * All "methods" start with the prefix "MlDsaState_".
+ */
+typedef struct MlDsaState_t {
+	uint8_t m_pk[MLDSA_PUBLIC_KEY_SIZE]; // public key bytes
+	uint8_t m_sk[MLDSA_PRIVATE_KEY_SIZE]; // private key bytes
+	uint8_t m_scratchpad[MLDSA_SIGNED_MESSAGE_SIZE]; // sized for one signature plus one message
+} MlDsaState;
+
+/**
+ * @brief      Get the private key bytes.
+ *
+ * @param[in]  self  The object
+ *
+ * @return     The private key bytes.
+ */
+uint8_t* MlDsaState_getPrivateKey(MlDsaState* self);
+
+/**
+ * @brief      Get the public key bytes.
+ *
+ * @param[in]  self  The object
+ *
+ * @return     The public key bytes.
+ */
+uint8_t* MlDsaState_getPublicKey(MlDsaState* self);
+
+/**
+ * @brief      Get the "scratch pad" for message storage, signature storage.
+ *
+ * @param      self  The object
+ *
+ * @return     Pointer to the scratch pad.
+ */
+uint8_t* MlDsaState_getScratchPad(MlDsaState* self);
+
+/**
+ * @brief      Verify a signed message.
+ *
+ * @param[in]     self           The object
+ * @param[in,out] signedMessage  Buffer of the signed message. This buffer MUST have
+ *                            length MLDSA_SIGNATURE_SIZE + MLDSA_MESSAGE_SIZE. It is not const and may be overwritten.
+ *
+ * @return     0 when verification passes, non-zero otherwise.
+ */
+int MlDsaState_verify(const MlDsaState* self, uint8_t *signedMessage);
+
+/**
+ * @brief      Sign a message.
+ *
+ * @param[in]  self           The object
+ * @param[out] signature      Buffer where the signature will be placed in. This
+ *                            buffer MUST have length MLDSA_SIGNATURE_SIZE.
+ * @param[in]  message        Buffer of the message to be signed. This buffer MUST
+ *                            have length MLDSA_MESSAGE_SIZE.
+ *
+ * @return     0 when signing succeeds, non-zero otherwise.
+ */
+int MlDsaState_sign(const MlDsaState* self, uint8_t* signature, const uint8_t* message);
+
+///
+/// @brief        Perform a forward NTT.
+///
+/// @param[in,out] coefficients Buffer of polynomial coefficients in integer
+///                             domain. The computation is done in-place, and
+///                             this array contains the coefficients in the
+///                             frequency domain after this function returns.
+/// @return       The implementation in wrapper.c always returns 0.
+int MlDsa_ntt(uint32_t *coefficients);
+
+#endif // _MLDSA_WRAPPER_H_
diff --git a/src/mlkem/CMakeLists.txt b/src/mlkem/CMakeLists.txt
new file mode 100644
index 0000000..566ffd9
--- /dev/null
+++ b/src/mlkem/CMakeLists.txt
@@ -0,0 +1 @@
+target_licensed_sources(wrapper.c wrapper.h)
diff --git a/src/mlkem/wrapper.c b/src/mlkem/wrapper.c
new file mode 100644
index 0000000..95db8a2
--- /dev/null
+++ b/src/mlkem/wrapper.c
@@ -0,0 +1,52 @@
+#include "wrapper.h"
+
+// These includes MUST stay private to wrapper.c,
+// otherwise we pollute the global namespace with equally named,
+// but totally different files (api.h / params.h)
+#include <params.h>  // include is located in pqm4 source tree
+#include <api.h>     // include is located in pqm4 source tree
+
+#if MLKEM_PUBLIC_KEY_SIZE != KYBER_PUBLICKEYBYTES
+#error invalid public key size, update me!
+#endif
+#if MLKEM_PRIVATE_KEY_SIZE != KYBER_SECRETKEYBYTES
+#error invalid private key size, update me!
+#endif
+#if MLKEM_SHARED_SECRET_SIZE != KYBER_SSBYTES
+#error invalid shared secret size, update me!
+#endif
+#if MLKEM_CIPHERTEXT_SIZE != KYBER_CIPHERTEXTBYTES
+#error invalid key encapsulation message size, update me!
+#endif
+
+uint8_t* MlKemState_getPrivateKey(MlKemState* self) {
+	return self->m_privateKey; // MLKEM_PRIVATE_KEY_SIZE bytes (see MlKemState in wrapper.h)
+}
+
+uint8_t* MlKemState_getPublicKey(MlKemState* self) {
+	return self->m_publicKey; // MLKEM_PUBLIC_KEY_SIZE bytes
+}
+
+uint8_t* MlKemState_getSharedSecretBuffer(MlKemState* self) {
+	return self->m_sharedSecretBuffer; // MLKEM_SHARED_SECRET_SIZE bytes
+}
+
+uint8_t* MlKemState_getKeyEncapsulationMessageBuffer(MlKemState* self) {
+	return self->m_keyEncapsulationMessageBuffer; // MLKEM_CIPHERTEXT_SIZE bytes
+}
+
+int MlKemState_generate(MlKemState* self) {
+	return crypto_kem_enc( // the pqm4 return value is handed back unchanged
+		self->m_keyEncapsulationMessageBuffer, // 1st argument: ciphertext buffer
+		self->m_sharedSecretBuffer, // 2nd argument: shared secret buffer
+		self->m_publicKey // 3rd argument: public key
+	);
+}
+
+int MlKemState_decode(MlKemState* self) {
+	return crypto_kem_dec( // the pqm4 return value is handed back unchanged
+		self->m_sharedSecretBuffer, // 1st argument: shared secret buffer
+		self->m_keyEncapsulationMessageBuffer, // 2nd argument: ciphertext buffer
+		self->m_privateKey // 3rd argument: private key
+	);
+}
diff --git a/src/mlkem/wrapper.h b/src/mlkem/wrapper.h
new file mode 100644
index 0000000..7cfcbac
--- /dev/null
+++ b/src/mlkem/wrapper.h
@@ -0,0 +1,70 @@
+#ifndef _MLKEM_WRAPPER_H_
+#define _MLKEM_WRAPPER_H_
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define MLKEM_PUBLIC_KEY_SIZE 800
+#define MLKEM_PRIVATE_KEY_SIZE 1632
+#define MLKEM_SHARED_SECRET_SIZE 32
+#define MLKEM_CIPHERTEXT_SIZE 768
+
+/**
+ * Simple object-oriented wrapper around the various ML-KEM functions.
+ * All "methods" start with the prefix "MlKemState_".
+ */
+typedef struct MlKemState_t {
+	uint8_t m_publicKey[MLKEM_PUBLIC_KEY_SIZE];
+	uint8_t m_privateKey[MLKEM_PRIVATE_KEY_SIZE];
+	uint8_t m_sharedSecretBuffer[MLKEM_SHARED_SECRET_SIZE];
+	uint8_t m_keyEncapsulationMessageBuffer[MLKEM_CIPHERTEXT_SIZE];
+} MlKemState;
+
+/**
+ * Get or set the private key bytes. The buffer returned by this method has
+ * size MLKEM_PRIVATE_KEY_SIZE.
+ */
+uint8_t* MlKemState_getPrivateKey(MlKemState* self);
+
+/**
+ * Get or set the public key bytes. The buffer returned by this method has size
+ * MLKEM_PUBLIC_KEY_SIZE.
+ */
+uint8_t* MlKemState_getPublicKey(MlKemState* self);
+
+/**
+ * Get or set the shared secret. The buffer returned by this method has size
+ * MLKEM_SHARED_SECRET_SIZE.
+ */
+uint8_t* MlKemState_getSharedSecretBuffer(MlKemState* self);
+
+/**
+ * Get or set the key encapsulation message. The buffer returned by this method
+ * has size MLKEM_CIPHERTEXT_SIZE.
+ */
+uint8_t* MlKemState_getKeyEncapsulationMessageBuffer(MlKemState* self);
+
+/**
+ * Generate a shared secret, as well as an accompanying key encapsulation
+ * message (the ciphertext) that is to be sent over a hypothetical public
+ * channel. The public key held in this object is passed to crypto_kem_enc.
+ *
+ * The generated shared secret can be retrieved via the
+ * MlKemState_getSharedSecretBuffer method.
+ *
+ * The key encapsulation message can be retrieved via the
+ * MlKemState_getKeyEncapsulationMessageBuffer method. Returns the crypto_kem_enc result.
+ */
+int MlKemState_generate(MlKemState* self);
+
+/**
+ * Decode the key encapsulation message that was written to the buffer pointed
+ * to by MlKemState_getKeyEncapsulationMessageBuffer into a shared secret,
+ * using the private key stored in the buffer of MlKemState_getPrivateKey.
+ *
+ * The shared secret can be retrieved via the MlKemState_getSharedSecretBuffer
+ * method. The return value is that of the underlying crypto_kem_dec call.
+ */
+int MlKemState_decode(MlKemState* self);
+
+#endif // _MLKEM_WRAPPER_H_
diff --git a/src/pqm4_hal/pinata_callbacks.c b/src/pqm4_hal/pinata_callbacks.c
new file mode 100644
index 0000000..68434fd
--- /dev/null
+++ b/src/pqm4_hal/pinata_callbacks.c
@@ -0,0 +1,17 @@
+#include "pinata_callbacks.h"
+#include <stddef.h>
+
+PINATA_PATCH_mldsa_sign_start_callback_t PINATA_PATCH_mldsa_start_callback = NULL;
+PINATA_PATCH_mldsa_sign_finish_callback_t PINATA_PATCH_mldsa_finish_callback = NULL;
+
+PINATA_PATCH_mldsa_sign_start_callback_t PINATA_PATCH_mldsa_set_sign_start_callback(PINATA_PATCH_mldsa_sign_start_callback_t f) {
+	PINATA_PATCH_mldsa_sign_start_callback_t old = PINATA_PATCH_mldsa_start_callback; // remember the currently installed callback
+	PINATA_PATCH_mldsa_start_callback = f; // f is stored as-is, NULL included
+	return old; // previous value; the global is initialised to NULL in this file
+}
+
+PINATA_PATCH_mldsa_sign_finish_callback_t PINATA_PATCH_mldsa_set_sign_finish_callback(PINATA_PATCH_mldsa_sign_finish_callback_t f) {
+	PINATA_PATCH_mldsa_sign_finish_callback_t old = PINATA_PATCH_mldsa_finish_callback; // remember the currently installed callback
+	PINATA_PATCH_mldsa_finish_callback = f; // f is stored as-is, NULL included
+	return old; // previous value; the global is initialised to NULL in this file
+}
diff --git a/src/pqm4_hal/pinata_callbacks.h b/src/pqm4_hal/pinata_callbacks.h
new file mode 100644
index 0000000..5dce5f3
--- /dev/null
+++ b/src/pqm4_hal/pinata_callbacks.h
@@ -0,0 +1,10 @@
+#pragma once
+
+typedef void(* PINATA_PATCH_mldsa_sign_start_callback_t)();
+typedef void(* PINATA_PATCH_mldsa_sign_finish_callback_t)();
+
+extern PINATA_PATCH_mldsa_sign_start_callback_t PINATA_PATCH_mldsa_start_callback;
+extern PINATA_PATCH_mldsa_sign_finish_callback_t PINATA_PATCH_mldsa_finish_callback;
+
+PINATA_PATCH_mldsa_sign_start_callback_t PINATA_PATCH_mldsa_set_sign_start_callback(PINATA_PATCH_mldsa_sign_start_callback_t f);
+PINATA_PATCH_mldsa_sign_finish_callback_t PINATA_PATCH_mldsa_set_sign_finish_callback(PINATA_PATCH_mldsa_sign_finish_callback_t f);
diff --git a/src/pqm4_hal/randombytes.c b/src/pqm4_hal/randombytes.c
new file mode 100644
index 0000000..e94095f
--- /dev/null
+++ b/src/pqm4_hal/randombytes.c
@@ -0,0 +1,25 @@
+#include "randombytes.h"
+#include "../rng.h" // implement pqm randombytes in terms of our own random functions
+#include <string.h>
+
+static uint32_t rng_get_random_internal(void) { // (void) gives a real prototype; empty parens leave the parameters unspecified before C23
+    while (RNG_GetFlagStatus(RNG_FLAG_DRDY) == RESET){} // busy-wait while the DRDY flag still reads RESET
+    return RNG_GetRandomNumber();
+}
+
+int randombytes(uint8_t *output, size_t n) {
+    uint32_t randomness;
+    RNG_Enable();
+    while (n >= sizeof(uint32_t)) { // copy whole 32-bit words while at least one full word is still needed
+        randomness = rng_get_random_internal();
+        memcpy(output, &randomness, sizeof(uint32_t));
+        n -= sizeof(uint32_t);
+        output += sizeof(uint32_t);
+    }
+    if (n > 0) { // tail shorter than one word: draw one more word and copy only n bytes of it
+        randomness = rng_get_random_internal();
+        memcpy(output, &randomness, n);
+    }
+    RNG_Disable();
+    return 0; // always 0; this function has no error path
+}
diff --git a/src/pqm4_hal/randombytes.h b/src/pqm4_hal/randombytes.h
new file mode 100644
index 0000000..5bb3f70
--- /dev/null
+++ b/src/pqm4_hal/randombytes.h
@@ -0,0 +1,12 @@
+#pragma once
+
+// This header file must be named "randombytes.h".
+// The include directory of this header file should be added
+// before the mupq include directory. This way, when
+// a pqm4/mupq file includes the file "randombytes.h", it will
+// include this file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+int randombytes(uint8_t *output, size_t n);
diff --git a/src/pqm4common/CMakeLists.txt b/src/pqm4common/CMakeLists.txt
deleted file mode 100644
index bf5eb5e..0000000
--- a/src/pqm4common/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-target_licensed_sources(fips202.c fips202.h keccakf1600.h keccakf1600.S)
diff --git a/src/pqm4common/fips202.c b/src/pqm4common/fips202.c
deleted file mode 100644
index 9696253..0000000
--- a/src/pqm4common/fips202.c
+++ /dev/null
@@ -1,863 +0,0 @@
-/* Based on the public domain implementation in
- * crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html
- * by Ronny Van Keer
- * and the public domain "TweetFips202" implementation
- * from https://twitter.com/tweetfips202
- * by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe */
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "fips202.h"
-#include "keccakf1600.h"
-
-#define NROUNDS 24
-#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
-
-#ifdef PROFILE_HASHING
-#include "hal.h"
-extern unsigned long long hash_cycles;
-#endif
-
-
-/*************************************************
- * Name:        keccak_absorb
- *
- * Description: Absorb step of Keccak;
- *              non-incremental, starts by zeroeing the state.
- *
- * Arguments:   - uint64_t *s:       pointer to (uninitialized) output Keccak state
- *              - uint32_t r:        rate in bytes (e.g., 168 for SHAKE128)
- *              - const uint8_t *m:  pointer to input to be absorbed into s
- *              - size_t mlen:       length of input in bytes
- *              - uint8_t p:         domain-separation byte for different Keccak-derived functions
- **************************************************/
-static void keccak_absorb(uint64_t *s,
-    uint32_t r,
-    const uint8_t *m, size_t mlen,
-    uint8_t p)
-{
-  while (mlen >= r)
-  {
-    KeccakF1600_StateXORBytes(s, m, 0, r);
-    KeccakF1600_StatePermute(s);
-    mlen -= r;
-    m += r;
-  }
-
-  if(mlen > 0){
-    KeccakF1600_StateXORBytes(s, m, 0, mlen);
-  }
-
-  if(mlen == r-1){
-    p |= 128;
-    KeccakF1600_StateXORBytes(s, &p, mlen, 1);
-  } else {
-    KeccakF1600_StateXORBytes(s, &p, mlen, 1);
-    p = 128;
-    KeccakF1600_StateXORBytes(s, &p, r-1, 1);
-  }
-}
-
-
-/*************************************************
- * Name:        keccak_squeezeblocks
- *
- * Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each.
- *              Modifies the state. Can be called multiple times to keep squeezing,
- *              i.e., is incremental.
- *
- * Arguments:   - uint8_t *h:     pointer to output blocks
- *              - size_t nblocks: number of blocks to be squeezed (written to h)
- *              - uint64_t *s:    pointer to in/output Keccak state
- *              - uint32_t r:     rate in bytes (e.g., 168 for SHAKE128)
- **************************************************/
-static void keccak_squeezeblocks(uint8_t *h, size_t nblocks,
-    uint64_t *s,
-    uint32_t r)
-{
-  while(nblocks > 0)
-  {
-    KeccakF1600_StatePermute(s);
-    KeccakF1600_StateExtractBytes(s, h, 0, r);
-    h += r;
-    nblocks--;
-  }
-}
-
-/*************************************************
- * Name:        keccak_inc_init
- *
- * Description: Initializes the incremental Keccak state to zero.
- *
- * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
- *                First 25 values represent Keccak state.
- *                26th value represents either the number of absorbed bytes
- *                that have not been permuted, or not-yet-squeezed bytes.
- **************************************************/
-static void keccak_inc_init(uint64_t *s_inc) {
-    size_t i;
-
-    for (i = 0; i < 25; ++i) {
-        s_inc[i] = 0;
-    }
-    s_inc[25] = 0;
-}
-/*************************************************
- * Name:        keccak_inc_absorb
- *
- * Description: Incremental keccak absorb
- *              Preceded by keccak_inc_init, succeeded by keccak_inc_finalize
- *
- * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
- *                First 25 values represent Keccak state.
- *                26th value represents either the number of absorbed bytes
- *                that have not been permuted, or not-yet-squeezed bytes.
- *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
- *              - const uint8_t *m: pointer to input to be absorbed into s_inc
- *              - size_t mlen: length of input in bytes
- **************************************************/
-static void keccak_inc_absorb(uint64_t *s_inc, uint32_t r, const uint8_t *m,
-                              size_t mlen) {
-    /* Recall that s_inc[25] is the non-absorbed bytes xored into the state */
-    while (mlen + s_inc[25] >= r) {
-
-        KeccakF1600_StateXORBytes(s_inc, m, s_inc[25], r-s_inc[25]);
-        mlen -= (size_t)(r - s_inc[25]);
-        m += r - s_inc[25];
-        s_inc[25] = 0;
-
-        KeccakF1600_StatePermute(s_inc);
-    }
-
-    KeccakF1600_StateXORBytes(s_inc, m, s_inc[25], mlen);
-    s_inc[25] += mlen;
-}
-
-/*************************************************
- * Name:        keccak_inc_finalize
- *
- * Description: Finalizes Keccak absorb phase, prepares for squeezing
- *
- * Arguments:   - uint64_t *s_inc: pointer to input/output incremental state
- *                First 25 values represent Keccak state.
- *                26th value represents either the number of absorbed bytes
- *                that have not been permuted, or not-yet-squeezed bytes.
- *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
- *              - uint8_t p: domain-separation byte for different
- *                                 Keccak-derived functions
- **************************************************/
-static void keccak_inc_finalize(uint64_t *s_inc, uint32_t r, uint8_t p) {
-    /* After keccak_inc_absorb, we are guaranteed that s_inc[25] < r,
-       so we can always use one more byte for p in the current state. */
-    if(s_inc[25] == r-1){
-      p |= 128;
-      KeccakF1600_StateXORBytes(s_inc, &p, s_inc[25], 1);
-    } else {
-      KeccakF1600_StateXORBytes(s_inc, &p, s_inc[25], 1);
-      p = 128;
-      KeccakF1600_StateXORBytes(s_inc, &p, r-1, 1);
-    }
-    s_inc[25] = 0;
-}
-
-/*************************************************
- * Name:        keccak_inc_squeeze
- *
- * Description: Incremental Keccak squeeze; can be called on byte-level
- *
- * Arguments:   - uint8_t *h: pointer to output bytes
- *              - size_t outlen: number of bytes to be squeezed
- *              - uint64_t *s_inc: pointer to input/output incremental state
- *                First 25 values represent Keccak state.
- *                26th value represents either the number of absorbed bytes
- *                that have not been permuted, or not-yet-squeezed bytes.
- *              - uint32_t r: rate in bytes (e.g., 168 for SHAKE128)
- **************************************************/
-static void keccak_inc_squeeze(uint8_t *h, size_t outlen,
-                               uint64_t *s_inc, uint32_t r) {
-    size_t len;
-    if(outlen < s_inc[25])
-    {
-        len = outlen;
-    }
-    else
-    {
-        len = s_inc[25];
-    }
-
-    KeccakF1600_StateExtractBytes(s_inc, h, r-s_inc[25], len);
-    h += len;
-    outlen -= len;
-    s_inc[25] -= len;
-
-    /* Then squeeze the remaining necessary blocks */
-    while (outlen > 0) {
-        KeccakF1600_StatePermute(s_inc);
-
-        if(outlen < r)
-        {
-            len = outlen;
-        }
-        else
-        {
-            len = r;
-        }
-        KeccakF1600_StateExtractBytes(s_inc, h, 0, len);
-        h += len;
-        outlen -= len;
-        s_inc[25] = r - len;
-    }
-}
-
-void shake128_inc_init(shake128incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_init(state->ctx);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_absorb(state->ctx, SHAKE128_RATE, input, inlen);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128_inc_finalize(shake128incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_finalize(state->ctx, SHAKE128_RATE, 0x1F);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_squeeze(output, outlen, state->ctx, SHAKE128_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128_inc_ctx_clone(shake128incctx* dest, const shake128incctx *src) {
-    memcpy(dest, src, sizeof(shake128incctx));
-}
-
-void shake128_inc_ctx_release(shake128incctx *state) {
-    (void) state;
-}
-
-void shake256_inc_init(shake256incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_init(state->ctx);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_absorb(state->ctx, SHAKE256_RATE, input, inlen);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake256_inc_finalize(shake256incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_finalize(state->ctx, SHAKE256_RATE, 0x1F);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_squeeze(output, outlen, state->ctx, SHAKE256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake256_inc_ctx_clone(shake256incctx* dest, const shake256incctx *src) {
-    memcpy(dest, src, sizeof(shake256incctx));
-}
-
-void shake256_inc_ctx_release(shake256incctx *state) {
-    (void) state;
-}
-
-/********** cSHAKE128 ***********/
-
-void cshake128_simple_absorb(shake128ctx *state, uint16_t cstm, const uint8_t *in, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-
-
-  uint8_t sep[8];
-  size_t i;
-
-  for (i = 0; i < 25; i++)
-    state->ctx[i] = 0;
-
-  /* Absorb customization (domain-separation) string */
-  sep[0] = 0x01;
-  sep[1] = 0xa8;
-  sep[2] = 0x01;
-  sep[3] = 0x00;
-  sep[4] = 0x01;
-  sep[5] = 16; // fixed bitlen of cstm
-  sep[6] = cstm & 0xff;
-  sep[7] = cstm >> 8;
-
-  KeccakF1600_StateXORBytes(state->ctx, sep, 0, 8);
-  KeccakF1600_StatePermute(state->ctx);
-
-  /* Absorb input */
-  keccak_absorb(state->ctx, SHAKE128_RATE, in, inlen, 0x04);
-
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-
-}
-
-
-void cshake128_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE128_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-
-void cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *in, size_t inlen)
-{
-  shake128incctx state;
-  uint8_t sep[8];
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-
-  keccak_inc_init(state.ctx);
-
-  /* Absorb customization (domain-separation) string */
-  sep[0] = 0x01;
-  sep[1] = 0xa8;
-  sep[2] = 0x01;
-  sep[3] = 0x00;
-  sep[4] = 0x01;
-  sep[5] = 16; // fixed bitlen of cstm
-  sep[6] = cstm & 0xff;
-  sep[7] = cstm >> 8;
-
-  KeccakF1600_StateXORBytes(state.ctx, sep, 0, 8);
-  KeccakF1600_StatePermute(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHAKE128_RATE, in, inlen);
-  keccak_inc_finalize(state.ctx, SHAKE128_RATE, 0x04);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, outlen, state.ctx, SHAKE128_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-
-
-/*************************************************
- * Name:        shake128_absorb
- *
- * Description: Absorb step of the SHAKE128 XOF.
- *              non-incremental, starts by zeroeing the state.
- *
- * Arguments:   - uint64_t *state:      pointer to (uninitialized) output Keccak state
- *              - const uint8_t *input: pointer to input to be absorbed into state
- *              - size_t inlen:         length of input in bytes
- **************************************************/
-void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  int i;
-  for (i = 0; i < 25; i++)
-    state->ctx[i] = 0;
-
-  keccak_absorb(state->ctx, SHAKE128_RATE, input, inlen, 0x1F);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-/*************************************************
- * Name:        shake128_squeezeblocks
- *
- * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each.
- *              Modifies the state. Can be called multiple times to keep squeezing,
- *              i.e., is incremental.
- *
- * Arguments:   - uint8_t *output:     pointer to output blocks
- *              - size_t nblocks:      number of blocks to be squeezed (written to output)
- *              - shake128ctx *state:  pointer to in/output Keccak state
- **************************************************/
-void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE128_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  shake128incctx state;
-
-  keccak_inc_init(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHAKE128_RATE, input, inlen);
-  keccak_inc_finalize(state.ctx, SHAKE128_RATE, 0x1F);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, outlen, state.ctx, SHAKE128_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake128_ctx_release(shake128ctx *state) {
-    (void) state;
-}
-void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src) {
-    memcpy(dest, src, sizeof(shake128ctx));
-}
-
-void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  int i;
-  for (i = 0; i < 25; i++)
-    state->ctx[i] = 0;
-
-  keccak_absorb(state->ctx, SHAKE256_RATE, input, inlen, 0x1F);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-
-void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-/*************************************************
- * Name:        shake256
- *
- * Description: SHAKE256 XOF with non-incremental API
- *
- * Arguments:   - uint8_t *output:      pointer to output
- *              - size_t outlen:        requested output length in bytes
- *              - const uint8_t *input: pointer to input
- *              - size_t inlen:         length of input in bytes
- **************************************************/
-void shake256(uint8_t *output, size_t outlen,
-    const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  shake256incctx state;
-
-  keccak_inc_init(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHAKE256_RATE, input, inlen);
-  keccak_inc_finalize(state.ctx, SHAKE256_RATE, 0x1F);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, outlen, state.ctx, SHAKE256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void shake256_ctx_release(shake256ctx *state) {
-    (void) state;
-}
-
-void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src) {
-    memcpy(dest, src, sizeof(shake256ctx));
-}
-
-
-/*************************************************
- * Name:        sha3_256
- *
- * Description: SHA3-256 with non-incremental API
- *
- * Arguments:   - uint8_t *output:      pointer to output
- *              - const uint8_t *input: pointer to input
- *              - size_t inlen:         length of input in bytes
- **************************************************/
-void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  sha3_256incctx state;
-  keccak_inc_init(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHA3_256_RATE, input, inlen);
-  keccak_inc_finalize(state.ctx, SHA3_256_RATE, 0x06);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, 32, state.ctx, SHA3_256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-void sha3_256_inc_init(sha3_256incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_init(state->ctx);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_absorb(state->ctx, SHA3_256_RATE, input, inlen);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    uint8_t t[SHA3_256_RATE];
-    keccak_inc_finalize(state->ctx, SHA3_256_RATE, 0x06);
-
-    keccak_squeezeblocks(t, 1, state->ctx, SHA3_256_RATE);
-
-    for (size_t i = 0; i < 32; i++) {
-        output[i] = t[i];
-    }
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src) {
-    memcpy(dest, src, sizeof(sha3_256incctx));
-}
-
-void sha3_256_inc_ctx_release(sha3_256incctx *state) {
-    (void) state;
-}
-
-void sha3_384_inc_init(sha3_384incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_init(state->ctx);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_absorb(state->ctx, SHA3_384_RATE, input, inlen);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    uint8_t t[SHA3_384_RATE];
-    keccak_inc_finalize(state->ctx, SHA3_384_RATE, 0x06);
-
-    keccak_squeezeblocks(t, 1, state->ctx, SHA3_384_RATE);
-
-    for (size_t i = 0; i < 48; i++) {
-        output[i] = t[i];
-    }
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src) {
-    memcpy(dest, src, sizeof(sha3_384incctx));
-}
-
-void sha3_384_inc_ctx_release(sha3_384incctx *state) {
-    (void) state;
-}
-
-/*************************************************
- * Name:        sha3_384
- *
- * Description: SHA3-256 with non-incremental API
- *
- * Arguments:   - uint8_t *output:      pointer to output
- *              - const uint8_t *input: pointer to input
- *              - size_t inlen:   length of input in bytes
- **************************************************/
-void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  sha3_384incctx state;
-  keccak_inc_init(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHA3_384_RATE, input, inlen);
-  keccak_inc_finalize(state.ctx, SHA3_384_RATE, 0x06);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, 48, state.ctx, SHA3_384_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-/*************************************************
- * Name:        sha3_512
- *
- * Description: SHA3-512 with non-incremental API
- *
- * Arguments:   - uint8_t *output:      pointer to output
- *              - const uint8_t *input: pointer to input
- *              - size_t inlen:         length of input in bytes
- **************************************************/
-void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  sha3_512incctx state;
-  keccak_inc_init(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHA3_512_RATE, input, inlen);
-  keccak_inc_finalize(state.ctx, SHA3_512_RATE, 0x06);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, 64, state.ctx, SHA3_512_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-void sha3_512_inc_init(sha3_512incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_init(state->ctx);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    keccak_inc_absorb(state->ctx, SHA3_512_RATE, input, inlen);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state) {
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-    uint8_t t[SHA3_512_RATE];
-    keccak_inc_finalize(state->ctx, SHA3_512_RATE, 0x06);
-
-    keccak_squeezeblocks(t, 1, state->ctx, SHA3_512_RATE);
-
-    for (size_t i = 0; i < 64; i++) {
-        output[i] = t[i];
-    }
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src) {
-    memcpy(dest, src, sizeof(sha3_512incctx));
-}
-
-void sha3_512_inc_ctx_release(sha3_512incctx *state) {
-    (void) state;
-}
-
-/********** cSHAKE256 ***********/
-
-void cshake256_simple_absorb(shake256ctx *state, uint16_t cstm, const uint8_t *in, size_t inlen)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  uint8_t sep[8];
-  size_t i;
-
-  for (i = 0; i < 25; i++)
-    state->ctx[i] = 0;
-
-  /* Absorb customization (domain-separation) string */
-  sep[0] = 0x01;
-  sep[1] = 0x88;
-  sep[2] = 0x01;
-  sep[3] = 0x00;
-  sep[4] = 0x01;
-  sep[5] = 16; // fixed bitlen of cstm
-  sep[6] = cstm & 0xff;
-  sep[7] = cstm >> 8;
-
-  KeccakF1600_StateXORBytes(state->ctx, sep, 0, 8);
-  KeccakF1600_StatePermute(state->ctx);
-
-  /* Absorb input */
-  keccak_absorb(state->ctx, SHAKE256_RATE, in, inlen, 0x04);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-
-void cshake256_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state)
-{
-#ifdef PROFILE_HASHING
-  uint64_t t0 = hal_get_time();
-#endif
-  keccak_squeezeblocks(output, nblocks, state->ctx, SHAKE256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
-
-
-void cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *in, size_t inlen)
-{
-  shake256incctx state;
-  uint8_t sep[8];
-  #ifdef PROFILE_HASHING
-    uint64_t t0 = hal_get_time();
-  #endif
-
-
-  keccak_inc_init(state.ctx);
-
-  /* Absorb customization (domain-separation) string */
-  sep[0] = 0x01;
-  sep[1] = 0x88;
-  sep[2] = 0x01;
-  sep[3] = 0x00;
-  sep[4] = 0x01;
-  sep[5] = 16; // fixed bitlen of cstm
-  sep[6] = cstm & 0xff;
-  sep[7] = cstm >> 8;
-
-  KeccakF1600_StateXORBytes(state.ctx, sep, 0, 8);
-  KeccakF1600_StatePermute(state.ctx);
-
-  /* Absorb input */
-  keccak_inc_absorb(state.ctx, SHAKE256_RATE, in, inlen);
-  keccak_inc_finalize(state.ctx, SHAKE256_RATE, 0x04);
-
-  /* Squeeze output */
-  keccak_inc_squeeze(output, outlen, state.ctx, SHAKE256_RATE);
-#ifdef PROFILE_HASHING
-  uint64_t t1 = hal_get_time();
-  hash_cycles += (t1-t0);
-#endif
-}
diff --git a/src/pqm4common/fips202.h b/src/pqm4common/fips202.h
deleted file mode 100644
index c8dc22f..0000000
--- a/src/pqm4common/fips202.h
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef FIPS202_H
-#define FIPS202_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define SHAKE128_RATE 168
-#define SHAKE256_RATE 136
-#define SHA3_256_RATE 136
-#define SHA3_384_RATE 104
-#define SHA3_512_RATE 72
-
-
-// Context for incremental API
-typedef struct {
-    uint64_t ctx[26];
-} shake128incctx;
-
-// Context for non-incremental API
-typedef struct {
-    uint64_t ctx[25];
-} shake128ctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t ctx[26];
-} shake256incctx;
-
-// Context for non-incremental API
-typedef struct {
-    uint64_t ctx[25];
-} shake256ctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t ctx[26];
-} sha3_256incctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t ctx[26];
-} sha3_384incctx;
-
-// Context for incremental API
-typedef struct {
-    uint64_t ctx[26];
-} sha3_512incctx;
-
-/* Initialize the state and absorb the provided input.
- *
- * This function does not support being called multiple times
- * with the same state.
- */
-void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
-/* Free the state */
-void shake128_ctx_release(shake128ctx *state);
-/* Copy the state. */
-void shake128_ctx_clone(shake128ctx *dest, const shake128ctx *src);
-
-void cshake128_simple_absorb(shake128ctx *state, uint16_t cstm, const uint8_t *input, size_t inlen);
-void cshake128_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
-void cshake128_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inlen);
-
-/* Initialize incremental hashing API */
-void shake128_inc_init(shake128incctx *state);
-/* Absorb more information into the XOF.
- *
- * Can be called multiple times.
- */
-void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
-/* Finalize the XOF for squeezing */
-void shake128_inc_finalize(shake128incctx *state);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
-/* Copy the context of the SHAKE128 XOF */
-void shake128_inc_ctx_clone(shake128incctx* dest, const shake128incctx *src);
-/* Free the context of the SHAKE128 XOF */
-void shake128_inc_ctx_release(shake128incctx *state);
-
-/* Initialize the state and absorb the provided input.
- *
- * This function does not support being called multiple times
- * with the same state.
- */
-void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
-/* Free the context held by this XOF */
-void shake256_ctx_release(shake256ctx *state);
-/* Copy the context held by this XOF */
-void shake256_ctx_clone(shake256ctx *dest, const shake256ctx *src);
-
-void cshake256_simple_absorb(shake256ctx *state, uint16_t cstm, const uint8_t *input, size_t inlen);
-void cshake256_simple_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
-void cshake256_simple(uint8_t *output, size_t outlen, uint16_t cstm, const uint8_t *input, size_t inlen);
-
-/* Initialize incremental hashing API */
-void shake256_inc_init(shake256incctx *state);
-void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
-/* Prepares for squeeze phase */
-void shake256_inc_finalize(shake256incctx *state);
-
-
-
-/* Squeeze output out of the sponge.
- *
- * Supports being called multiple times
- */
-void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
-/* Copy the state */
-void shake256_inc_ctx_clone(shake256incctx* dest, const shake256incctx *src);
-/* Free the state */
-void shake256_inc_ctx_release(shake256incctx *state);
-
-/* One-stop SHAKE128 call */
-void shake128(uint8_t *output, size_t outlen,
-              const uint8_t *input, size_t inlen);
-
-/* One-stop SHAKE256 call */
-void shake256(uint8_t *output, size_t outlen,
-              const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_256_inc_init(sha3_256incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
-/* Copy the context */
-void sha3_256_inc_ctx_clone(sha3_256incctx *dest, const sha3_256incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_256_inc_ctx_release(sha3_256incctx *state);
-
-void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_384_inc_init(sha3_384incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_384_inc_absorb(sha3_384incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_384_inc_finalize(uint8_t *output, sha3_384incctx *state);
-/* Copy the context */
-void sha3_384_inc_ctx_clone(sha3_384incctx *dest, const sha3_384incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_384_inc_ctx_release(sha3_384incctx *state);
-
-/* One-stop SHA3-384 shop */
-void sha3_384(uint8_t *output, const uint8_t *input, size_t inlen);
-
-/* Initialize the incremental hashing state */
-void sha3_512_inc_init(sha3_512incctx *state);
-/* Absorb blocks into SHA3 */
-void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
-/* Obtain the output of the function and free `state` */
-void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
-/* Copy the context */
-void sha3_512_inc_ctx_clone(sha3_512incctx *dest, const sha3_512incctx *src);
-/* Release the state, don't use if `_finalize` has been used */
-void sha3_512_inc_ctx_release(sha3_512incctx *state);
-
-/* One-stop SHA3-512 shop */
-void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
-#endif
diff --git a/src/pqm4common/keccakf1600.S b/src/pqm4common/keccakf1600.S
deleted file mode 100644
index 3a6f044..0000000
--- a/src/pqm4common/keccakf1600.S
+++ /dev/null
@@ -1,736 +0,0 @@
-@
-@ Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
-@ Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
-@ denoted as "the implementer".
-@
-@ For more information, feedback or questions, please refer to our websites:
-@ http://keccak.noekeon.org/
-@ http://keyak.noekeon.org/
-@ http://ketje.noekeon.org/
-@
-@ To the extent possible under law, the implementer has waived all copyright
-@ and related or neighboring rights to the source code in this file.
-@ http://creativecommons.org/publicdomain/zero/1.0/
-@
-
-@ WARNING: These functions work only on little endian CPU with@ ARMv7m architecture (ARM Cortex-M3, ...).
-
-
-	.thumb
-	.syntax unified
-.text
-
-	@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
-.macro	toBitInterleaving	x0,x1,s0,s1,t,over
-
-	and		\t,\x0,#0x55555555
-	orr		\t,\t,\t, LSR #1
-	and		\t,\t,#0x33333333
-	orr		\t,\t,\t, LSR #2
-	and		\t,\t,#0x0F0F0F0F
-	orr		\t,\t,\t, LSR #4
-	and		\t,\t,#0x00FF00FF
-	bfi		\t,\t,#8, #8
-	.if \over != 0
-	lsr		\s0,\t, #8
-	.else
-	eor		\s0,\s0,\t, LSR #8
-	.endif
-
-	and		\t,\x1,#0x55555555
-	orr		\t,\t,\t, LSR #1
-	and		\t,\t,#0x33333333
-	orr		\t,\t,\t, LSR #2
-	and		\t,\t,#0x0F0F0F0F
-	orr		\t,\t,\t, LSR #4
-	and		\t,\t,#0x00FF00FF
-	orr		\t,\t,\t, LSR #8
-	eor		\s0,\s0,\t, LSL #16
-
-	and		\t,\x0,#0xAAAAAAAA
-	orr		\t,\t,\t, LSL #1
-	and		\t,\t,#0xCCCCCCCC
-	orr		\t,\t,\t, LSL #2
-	and		\t,\t,#0xF0F0F0F0
-	orr		\t,\t,\t, LSL #4
-	and		\t,\t,#0xFF00FF00
-	orr		\t,\t,\t, LSL #8
-	.if \over != 0
-	lsr		\s1,\t, #16
-	.else
-	eor		\s1,\s1,\t, LSR #16
-	.endif
-
-	and		\t,\x1,#0xAAAAAAAA
-	orr		\t,\t,\t, LSL #1
-	and		\t,\t,#0xCCCCCCCC
-	orr		\t,\t,\t, LSL #2
-	and		\t,\t,#0xF0F0F0F0
-	orr		\t,\t,\t, LSL #4
-	and		\t,\t,#0xFF00FF00
-	orr		\t,\t,\t, LSL #8
-	bfc		\t, #0, #16
-	eors	\s1,\s1,\t
-	.endm
-
-	@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
-.macro	fromBitInterleaving		x0, x1, t
-
-	movs	\t, \x0					@ t = x0@
-	bfi		\x0, \x1, #16, #16		@ x0 = (x0 & 0x0000FFFF) | (x1 << 16)@
-	bfc		\x1, #0, #16			@	x1 = (t >> 16) | (x1 & 0xFFFF0000)@
-	orr		\x1, \x1, \t, LSR #16
-
-    eor		\t, \x0, \x0, LSR #8    @ t = (x0 ^ (x0 >>  8)) & 0x0000FF00UL@  x0 = x0 ^ t ^ (t <<  8)@
-	and		\t, #0x0000FF00
-    eors	\x0, \x0, \t
-    eor		\x0, \x0, \t, LSL #8
-
-    eor		\t, \x0, \x0, LSR #4	@ t = (x0 ^ (x0 >>  4)) & 0x00F000F0UL@  x0 = x0 ^ t ^ (t <<  4)@
-	and		\t, #0x00F000F0
-    eors	\x0, \x0, \t
-    eor		\x0, \x0, \t, LSL #4
-
-    eor		\t, \x0, \x0, LSR #2	@ t = (x0 ^ (x0 >>  2)) & 0x0C0C0C0CUL@  x0 = x0 ^ t ^ (t <<  2)@
-	and		\t, #0x0C0C0C0C
-    eors	\x0, \x0, \t
-    eor		\x0, \x0, \t, LSL #2
-
-    eor		\t, \x0, \x0, LSR #1	@ t = (x0 ^ (x0 >>  1)) & 0x22222222UL@  x0 = x0 ^ t ^ (t <<  1)@
-	and		\t, #0x22222222
-    eors	\x0, \x0, \t
-    eor		\x0, \x0, \t, LSL #1
-
-    eor		\t, \x1, \x1, LSR #8    @ t = (x1 ^ (x1 >>  8)) & 0x0000FF00UL@  x1 = x1 ^ t ^ (t <<  8)@
-	and		\t, #0x0000FF00
-    eors	\x1, \x1, \t
-    eor		\x1, \x1, \t, LSL #8
-
-    eor		\t, \x1, \x1, LSR #4	@ t = (x1 ^ (x1 >>  4)) & 0x00F000F0UL@  x1 = x1 ^ t ^ (t <<  4)@
-	and		\t, #0x00F000F0
-    eors	\x1, \x1, \t
-    eor		\x1, \x1, \t, LSL #4
-
-    eor		\t, \x1, \x1, LSR #2	@ t = (x1 ^ (x1 >>  2)) & 0x0C0C0C0CUL@  x1 = x1 ^ t ^ (t <<  2)@
-	and		\t, #0x0C0C0C0C
-    eors	\x1, \x1, \t
-    eor		\x1, \x1, \t, LSL #2
-
-    eor		\t, \x1, \x1, LSR #1	@ t = (x1 ^ (x1 >>  1)) & 0x22222222UL@  x1 = x1 ^ t ^ (t <<  1)@
-	and		\t, #0x22222222
-    eors	\x1, \x1, \t
-    eor		\x1, \x1, \t, LSL #1
-	.endm
-
-@	--- offsets in state
-.equ Aba0, 0*4
-.equ Aba1, 1*4
-.equ Abe0, 2*4
-.equ Abe1, 3*4
-.equ Abi0, 4*4
-.equ Abi1, 5*4
-.equ Abo0, 6*4
-.equ Abo1, 7*4
-.equ Abu0, 8*4
-.equ Abu1, 9*4
-.equ Aga0, 10*4
-.equ Aga1, 11*4
-.equ Age0, 12*4
-.equ Age1, 13*4
-.equ Agi0, 14*4
-.equ Agi1, 15*4
-.equ Ago0, 16*4
-.equ Ago1, 17*4
-.equ Agu0, 18*4
-.equ Agu1, 19*4
-.equ Aka0, 20*4
-.equ Aka1, 21*4
-.equ Ake0, 22*4
-.equ Ake1, 23*4
-.equ Aki0, 24*4
-.equ Aki1, 25*4
-.equ Ako0, 26*4
-.equ Ako1, 27*4
-.equ Aku0, 28*4
-.equ Aku1, 29*4
-.equ Ama0, 30*4
-.equ Ama1, 31*4
-.equ Ame0, 32*4
-.equ Ame1, 33*4
-.equ Ami0, 34*4
-.equ Ami1, 35*4
-.equ Amo0, 36*4
-.equ Amo1, 37*4
-.equ Amu0, 38*4
-.equ Amu1, 39*4
-.equ Asa0, 40*4
-.equ Asa1, 41*4
-.equ Ase0, 42*4
-.equ Ase1, 43*4
-.equ Asi0, 44*4
-.equ Asi1, 45*4
-.equ Aso0, 46*4
-.equ Aso1, 47*4
-.equ Asu0, 48*4
-.equ Asu1, 49*4
-
-@	--- offsets on stack
-.equ mDa0, 0*4
-.equ mDa1, 1*4
-.equ mDo0, 2*4
-.equ mDo1, 3*4
-.equ mDi0, 4*4
-.equ mRC	, 5*4
-.equ mSize, 6*4
-
-
-.macro	xor5		result,b,g,k,m,s
-
-	ldr			\result, [r0, #\b]
-	ldr			r1, [r0, #\g]
-	eors		\result, \result, r1
-	ldr			r1, [r0, #\k]
-	eors		\result, \result, r1
-	ldr			r1, [r0, #\m]
-	eors		\result, \result, r1
-	ldr			r1, [r0, #\s]
-	eors		\result, \result, r1
-	.endm
-
-.macro	xorrol 		result, aa, bb
-
-	eor			\result, \aa, \bb, ROR #31
-	.endm
-
-.macro	xandnot 	resofs, aa, bb, cc
-
-	bic			r1, \cc, \bb
-	eors		r1, r1, \aa
-	str			r1, [r0, #\resofs]
-	.endm
-
-.macro	KeccakThetaRhoPiChiIota aA1, aDax, aA2, aDex, rot2, aA3, aDix, rot3, aA4, aDox, rot4, aA5, aDux, rot5, offset, last
-	ldr		r3, [r0, #\aA1]
-	ldr		r4, [r0, #\aA2]
-	ldr		r5, [r0, #\aA3]
-	ldr		r6, [r0, #\aA4]
-	ldr		r7, [r0, #\aA5]
-	eors	r3, r3, \aDax
-	eors	r5, r5, \aDix
-	eors	r4, r4, \aDex
-	eors	r6, r6, \aDox
-	eors	r7, r7, \aDux
-	rors	r4, #32-\rot2
-	rors	r5, #32-\rot3
-	rors	r6, #32-\rot4
-	rors	r7, #32-\rot5
-    xandnot \aA2, r4, r5, r6
-    xandnot \aA3, r5, r6, r7
-    xandnot \aA4, r6, r7, r3
-    xandnot \aA5, r7, r3, r4
-	ldr		r1, [sp, #mRC]
-	bics	r5, r5, r4
-	ldr		r4, [r1, #\offset]
-	eors	r3, r3, r5
-	eors	r3, r3, r4
-	.if	\last == 1
-	ldr		r4, [r1, #32]!
-	str		r1, [sp, #mRC]
-	cmp		r4, #0xFF
-	.endif
-	str		r3, [r0, #\aA1]
-	.endm
-
-.macro	KeccakThetaRhoPiChi aB1, aA1, aDax, rot1, aB2, aA2, aDex, rot2, aB3, aA3, aDix, rot3, aB4, aA4, aDox, rot4, aB5, aA5, aDux, rot5
-	ldr		\aB1, [r0, #\aA1]
-	ldr		\aB2, [r0, #\aA2]
-	ldr		\aB3, [r0, #\aA3]
-	ldr		\aB4, [r0, #\aA4]
-	ldr		\aB5, [r0, #\aA5]
-	eors	\aB1, \aB1, \aDax
-	eors	\aB3, \aB3, \aDix
-	eors	\aB2, \aB2, \aDex
-	eors	\aB4, \aB4, \aDox
-	eors	\aB5, \aB5, \aDux
-	rors	\aB1, #32-\rot1
-	.if	\rot2 > 0
-	rors	\aB2, #32-\rot2
-	.endif
-	rors	\aB3, #32-\rot3
-	rors	\aB4, #32-\rot4
-	rors	\aB5, #32-\rot5
-	xandnot \aA1, r3, r4, r5
-    xandnot \aA2, r4, r5, r6
-    xandnot \aA3, r5, r6, r7
-    xandnot \aA4, r6, r7, r3
-    xandnot \aA5, r7, r3, r4
-	.endm
-
-.macro	KeccakRound0
-
-	xor5        r3,  Abu0, Agu0, Aku0, Amu0, Asu0
-	xor5        r7, Abe1, Age1, Ake1, Ame1, Ase1
-	xorrol      r6, r3, r7
-	str			r6, [sp, #mDa0]
-	xor5        r6,  Abu1, Agu1, Aku1, Amu1, Asu1
-	xor5        lr, Abe0, Age0, Ake0, Ame0, Ase0
-	eors        r8, r6, lr
-	str			r8, [sp, #mDa1]
-
-	xor5        r5,  Abi0, Agi0, Aki0, Ami0, Asi0
-	xorrol      r9, r5, r6
-	str			r9, [sp, #mDo0]
-	xor5        r4,  Abi1, Agi1, Aki1, Ami1, Asi1
-	eors		r3, r3, r4
-	str			r3, [sp, #mDo1]
-
-	xor5        r3,  Aba0, Aga0, Aka0, Ama0, Asa0
-	xorrol      r10, r3, r4
-	xor5        r6,  Aba1, Aga1, Aka1, Ama1, Asa1
-	eors        r11, r6, r5
-
-	xor5        r4,  Abo1, Ago1, Ako1, Amo1, Aso1
-	xorrol      r5, lr, r4
-	str			r5, [sp, #mDi0]
-	xor5        r5,  Abo0, Ago0, Ako0, Amo0, Aso0
-	eors        r2, r7, r5
-
-	xorrol      r12, r5, r6
-	eors        lr, r4, r3
-
-	KeccakThetaRhoPiChi r5, Aka1, r8,  2, r6, Ame1, r11, 23, r7, Asi1, r2, 31, r3, Abo0, r9, 14, r4, Agu0, r12, 10
-	KeccakThetaRhoPiChi r7, Asa1, r8,  9, r3, Abe0, r10,  0, r4, Agi1, r2,  3, r5, Ako0, r9, 12, r6, Amu1, lr,  4
-	ldr			r8, [sp, #mDa0]
-	KeccakThetaRhoPiChi r4, Aga0, r8, 18, r5, Ake0, r10,  5, r6, Ami1, r2,  8, r7, Aso0, r9, 28, r3, Abu1, lr, 14
-	KeccakThetaRhoPiChi r6, Ama0, r8, 20, r7, Ase1, r11,  1, r3, Abi1, r2, 31, r4, Ago0, r9, 27, r5, Aku0, r12, 19
-	ldr			r9, [sp, #mDo1]
-	KeccakThetaRhoPiChiIota  Aba0, r8,          Age0, r10, 22,      Aki1, r2, 22,      Amo1, r9, 11,      Asu0, r12,  7, 0, 0
-
-	ldr			r2, [sp, #mDi0]
-	KeccakThetaRhoPiChi r5, Aka0, r8,  1, r6, Ame0, r10, 22, r7, Asi0, r2, 30, r3, Abo1, r9, 14, r4, Agu1, lr, 10
-	KeccakThetaRhoPiChi r7, Asa0, r8,  9, r3, Abe1, r11,  1, r4, Agi0, r2,  3, r5, Ako1, r9, 13, r6, Amu0, r12,  4
-	ldr			r8, [sp, #mDa1]
-	KeccakThetaRhoPiChi r4, Aga1, r8, 18, r5, Ake1, r11,  5, r6, Ami0, r2,  7, r7, Aso1, r9, 28, r3, Abu0, r12, 13
-	KeccakThetaRhoPiChi r6, Ama1, r8, 21, r7, Ase0, r10,  1, r3, Abi0, r2, 31, r4, Ago1, r9, 28, r5, Aku1, lr, 20
-	ldr			r9, [sp, #mDo0]
-	KeccakThetaRhoPiChiIota  Aba1, r8,          Age1, r11, 22,      Aki0, r2, 21,      Amo0, r9, 10,      Asu1, lr,  7, 4, 0
-	.endm
-
-.macro	KeccakRound1
-
-	xor5        r3,  Asu0, Agu0, Amu0, Abu1, Aku1
-	xor5        r7, Age1, Ame0, Abe0, Ake1, Ase1
-	xorrol      r6, r3, r7
-	str			r6, [sp, #mDa0]
-	xor5        r6,  Asu1, Agu1, Amu1, Abu0, Aku0
-	xor5        lr, Age0, Ame1, Abe1, Ake0, Ase0
-	eors        r8, r6, lr
-	str			r8, [sp, #mDa1]
-
-	xor5        r5,  Aki1, Asi1, Agi0, Ami1, Abi0
-	xorrol      r9, r5, r6
-	str			r9, [sp, #mDo0]
-	xor5        r4,  Aki0, Asi0, Agi1, Ami0, Abi1
-	eors		r3, r3, r4
-	str			r3, [sp, #mDo1]
-
-	xor5        r3,  Aba0, Aka1, Asa0, Aga0, Ama1
-	xorrol      r10, r3, r4
-	xor5        r6,  Aba1, Aka0, Asa1, Aga1, Ama0
-	eors        r11, r6, r5
-
-	xor5        r4,  Amo0, Abo1, Ako0, Aso1, Ago0
-	xorrol      r5, lr, r4
-	str			r5, [sp, #mDi0]
-	xor5        r5,  Amo1, Abo0, Ako1, Aso0, Ago1
-	eors        r2, r7, r5
-
-	xorrol      r12, r5, r6
-	eors        lr, r4, r3
-
-	KeccakThetaRhoPiChi r5, Asa1, r8,  2, r6, Ake1, r11, 23, r7, Abi1, r2, 31, r3, Amo1, r9, 14, r4, Agu0, r12, 10
-	KeccakThetaRhoPiChi r7, Ama0, r8,  9, r3, Age0, r10,  0, r4, Asi0, r2,  3, r5, Ako1, r9, 12, r6, Abu0, lr,  4
-	ldr			r8, [sp, #mDa0]
-	KeccakThetaRhoPiChi r4, Aka1, r8, 18, r5, Abe1, r10,  5, r6, Ami0, r2,  8, r7, Ago1, r9, 28, r3, Asu1, lr, 14
-	KeccakThetaRhoPiChi r6, Aga0, r8, 20, r7, Ase1, r11,  1, r3, Aki0, r2, 31, r4, Abo0, r9, 27, r5, Amu0, r12, 19
-	ldr			r9, [sp, #mDo1]
-	KeccakThetaRhoPiChiIota  Aba0, r8,          Ame1, r10, 22,      Agi1, r2, 22,      Aso1, r9, 11,      Aku1, r12,  7, 8, 0
-
-	ldr			r2, [sp, #mDi0]
-	KeccakThetaRhoPiChi r5, Asa0, r8,  1, r6, Ake0, r10, 22, r7, Abi0, r2, 30, r3, Amo0, r9, 14, r4, Agu1, lr, 10
-	KeccakThetaRhoPiChi r7, Ama1, r8,  9, r3, Age1, r11,  1, r4, Asi1, r2,  3, r5, Ako0, r9, 13, r6, Abu1, r12,  4
-	ldr			r8, [sp, #mDa1]
-	KeccakThetaRhoPiChi r4, Aka0, r8, 18, r5, Abe0, r11,  5, r6, Ami1, r2,  7, r7, Ago0, r9, 28, r3, Asu0, r12, 13
-	KeccakThetaRhoPiChi r6, Aga1, r8, 21, r7, Ase0, r10,  1, r3, Aki1, r2, 31, r4, Abo1, r9, 28, r5, Amu1, lr, 20
-	ldr			r9, [sp, #mDo0]
-	KeccakThetaRhoPiChiIota  Aba1, r8,          Ame0, r11, 22,      Agi0, r2, 21,      Aso0, r9, 10,      Aku0, lr,  7, 12, 0
-	.endm
-
-.macro	KeccakRound2
-
-	xor5        r3, Aku1, Agu0, Abu1, Asu1, Amu1
-	xor5        r7, Ame0, Ake0, Age0, Abe0, Ase1
-	xorrol      r6, r3, r7
-	str			r6, [sp, #mDa0]
-	xor5        r6,  Aku0, Agu1, Abu0, Asu0, Amu0
-	xor5        lr, Ame1, Ake1, Age1, Abe1, Ase0
-	eors        r8, r6, lr
-	str			r8, [sp, #mDa1]
-
-	xor5        r5,  Agi1, Abi1, Asi1, Ami0, Aki1
-	xorrol      r9, r5, r6
-	str			r9, [sp, #mDo0]
-	xor5        r4,  Agi0, Abi0, Asi0, Ami1, Aki0
-	eors		r3, r3, r4
-	str			r3, [sp, #mDo1]
-
-	xor5        r3,  Aba0, Asa1, Ama1, Aka1, Aga1
-	xorrol      r10, r3, r4
-	xor5        r6,  Aba1, Asa0, Ama0, Aka0, Aga0
-	eors        r11, r6, r5
-
-	xor5        r4,  Aso0, Amo0, Ako1, Ago0, Abo0
-	xorrol      r5, lr, r4
-	str			r5, [sp, #mDi0]
-	xor5        r5,  Aso1, Amo1, Ako0, Ago1, Abo1
-	eors        r2, r7, r5
-
-	xorrol      r12, r5, r6
-	eors        lr, r4, r3
-
-	KeccakThetaRhoPiChi r5, Ama0, r8,  2, r6, Abe0, r11, 23, r7, Aki0, r2, 31, r3, Aso1, r9, 14, r4, Agu0, r12, 10
-	KeccakThetaRhoPiChi r7, Aga0, r8,  9, r3, Ame1, r10,  0, r4, Abi0, r2,  3, r5, Ako0, r9, 12, r6, Asu0, lr,  4
-	ldr			r8, [sp, #mDa0]
-	KeccakThetaRhoPiChi r4, Asa1, r8, 18, r5, Age1, r10,  5, r6, Ami1, r2,  8, r7, Abo1, r9, 28, r3, Aku0, lr, 14
-	KeccakThetaRhoPiChi r6, Aka1, r8, 20, r7, Ase1, r11,  1, r3, Agi0, r2, 31, r4, Amo1, r9, 27, r5, Abu1, r12, 19
-	ldr			r9, [sp, #mDo1]
-	KeccakThetaRhoPiChiIota  Aba0, r8,          Ake1, r10, 22,      Asi0, r2, 22,      Ago0, r9, 11,      Amu1, r12,  7, 16, 0
-
-	ldr			r2, [sp, #mDi0]
-	KeccakThetaRhoPiChi r5, Ama1, r8,  1, r6, Abe1, r10, 22, r7, Aki1, r2, 30, r3, Aso0, r9, 14, r4, Agu1, lr, 10
-	KeccakThetaRhoPiChi r7, Aga1, r8,  9, r3, Ame0, r11,  1, r4, Abi1, r2,  3, r5, Ako1, r9, 13, r6, Asu1, r12,  4
-	ldr			r8, [sp, #mDa1]
-	KeccakThetaRhoPiChi r4, Asa0, r8, 18, r5, Age0, r11,  5, r6, Ami0, r2,  7, r7, Abo0, r9, 28, r3, Aku1, r12, 13
-	KeccakThetaRhoPiChi r6, Aka0, r8, 21, r7, Ase0, r10,  1, r3, Agi1, r2, 31, r4, Amo0, r9, 28, r5, Abu0, lr, 20
-	ldr			r9, [sp, #mDo0]
-	KeccakThetaRhoPiChiIota  Aba1, r8,          Ake0, r11, 22,      Asi1, r2, 21,      Ago1, r9, 10,      Amu0, lr,  7, 20, 0
-	.endm
-
-.macro	KeccakRound3
-
-	xor5        r3,  Amu1, Agu0, Asu1, Aku0, Abu0
-	xor5        r7, Ake0, Abe1, Ame1, Age0, Ase1
-	xorrol      r6, r3, r7
-	str			r6, [sp, #mDa0]
-	xor5        r6,  Amu0, Agu1, Asu0, Aku1, Abu1
-	xor5        lr, Ake1, Abe0, Ame0, Age1, Ase0
-	eors        r8, r6, lr
-	str			r8, [sp, #mDa1]
-
-	xor5        r5,  Asi0, Aki0, Abi1, Ami1, Agi1
-	xorrol      r9, r5, r6
-	str			r9, [sp, #mDo0]
-	xor5        r4,  Asi1, Aki1, Abi0, Ami0, Agi0
-	eors		r3, r3, r4
-	str			r3, [sp, #mDo1]
-
-	xor5        r3,  Aba0, Ama0, Aga1, Asa1, Aka0
-	xorrol      r10, r3, r4
-	xor5        r6,  Aba1, Ama1, Aga0, Asa0, Aka1
-	eors        r11, r6, r5
-
-	xor5        r4,  Ago1, Aso0, Ako0, Abo0, Amo1
-	xorrol      r5, lr, r4
-	str			r5, [sp, #mDi0]
-	xor5        r5,  Ago0, Aso1, Ako1, Abo1, Amo0
-	eors        r2, r7, r5
-
-	xorrol      r12, r5, r6
-	eors        lr, r4, r3
-
-	KeccakThetaRhoPiChi r5, Aga0, r8,  2, r6, Age0, r11, 23, r7, Agi0, r2, 31, r3, Ago0, r9, 14, r4, Agu0, r12, 10
-	KeccakThetaRhoPiChi r7, Aka1, r8,  9, r3, Ake1, r10,  0, r4, Aki1, r2,  3, r5, Ako1, r9, 12, r6, Aku1, lr,  4
-	ldr			r8, [sp, #mDa0]
-	KeccakThetaRhoPiChi r4, Ama0, r8, 18, r5, Ame0, r10,  5, r6, Ami0, r2,  8, r7, Amo0, r9, 28, r3, Amu0, lr, 14
-	KeccakThetaRhoPiChi r6, Asa1, r8, 20, r7, Ase1, r11,  1, r3, Asi1, r2, 31, r4, Aso1, r9, 27, r5, Asu1, r12, 19
-	ldr			r9, [sp, #mDo1]
-	KeccakThetaRhoPiChiIota  Aba0, r8,          Abe0, r10, 22,      Abi0, r2, 22,      Abo0, r9, 11,      Abu0, r12,  7, 24, 0
-
-	ldr			r2, [sp, #mDi0]
-	KeccakThetaRhoPiChi r5, Aga1, r8,  1, r6, Age1, r10, 22, r7, Agi1, r2, 30, r3, Ago1, r9, 14, r4, Agu1, lr, 10
-	KeccakThetaRhoPiChi r7, Aka0, r8,  9, r3, Ake0, r11,  1, r4, Aki0, r2,  3, r5, Ako0, r9, 13, r6, Aku0, r12,  4
-	ldr			r8, [sp, #mDa1]
-	KeccakThetaRhoPiChi r4, Ama1, r8, 18, r5, Ame1, r11,  5, r6, Ami1, r2,  7, r7, Amo1, r9, 28, r3, Amu1, r12, 13
-	KeccakThetaRhoPiChi r6, Asa0, r8, 21, r7, Ase0, r10,  1, r3, Asi0, r2, 31, r4, Aso0, r9, 28, r5, Asu0, lr, 20
-	ldr			r9, [sp, #mDo0]
-	KeccakThetaRhoPiChiIota  Aba1, r8,          Abe1, r11, 22,      Abi1, r2, 21,      Abo1, r9, 10,      Abu1, lr,  7, 28, 1
-	.endm
-
-
-@----------------------------------------------------------------------------
-@
-@ void KeccakF1600_Initialize( void )
-@
-.align 8
-.global   KeccakF1600_Initialize
-KeccakF1600_Initialize:
-	bx		lr
-
-
-
-@----------------------------------------------------------------------------
-@
-@ void KeccakF1600_StateXORBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
-@
-.align 8
-.global   KeccakF1600_StateXORBytes
-KeccakF1600_StateXORBytes:
-	cbz		r3, KeccakF1600_StateXORBytes_Exit1
-	push	{r4 - r8, lr}							@ then
-	bic		r4, r2, #7								@ offset &= ~7
-	adds	r0, r0, r4								@ add whole lane offset to state pointer
-	ands	r2, r2, #7								@ offset &= 7 (part not lane aligned)
-	beq		KeccakF1600_StateXORBytes_CheckLanes	@ .if offset != 0
-	movs	r4, r3									@ then, do remaining bytes in first lane
-	rsb		r5, r2, #8								@ max size in lane = 8 - offset
-	cmp		r4, r5
-	ble		KeccakF1600_StateXORBytes_BytesAlign
-	movs	r4, r5
-KeccakF1600_StateXORBytes_BytesAlign:
-	sub		r8, r3, r4								@ size left
-	movs	r3, r4
-	bl		__KeccakF1600_StateXORBytesInLane
-	mov		r3, r8
-KeccakF1600_StateXORBytes_CheckLanes:
-	lsrs	r2, r3, #3								@ .if length >= 8
-	beq		KeccakF1600_StateXORBytes_Bytes
-	mov		r8, r3
-	bl		__KeccakF1600_StateXORLanes
-	and		r3, r8, #7
-KeccakF1600_StateXORBytes_Bytes:
-	cbz		r3, KeccakF1600_StateXORBytes_Exit
-	movs	r2, #0
-	bl		__KeccakF1600_StateXORBytesInLane
-KeccakF1600_StateXORBytes_Exit:
-	pop		{r4 - r8, pc}
-KeccakF1600_StateXORBytes_Exit1:
-	bx		lr
-
-
-@----------------------------------------------------------------------------
-@
-@ __KeccakF1600_StateXORLanes
-@
-@ Input:
-@  r0 state pointer
-@  r1 data pointer
-@  r2 laneCount
-@
-@ Output:
-@  r0 state pointer next lane
-@  r1 data pointer next byte to input
-@
-@ Changed: r2-r7
-@
-.align 8
-__KeccakF1600_StateXORLanes:
-__KeccakF1600_StateXORLanes_LoopAligned:
-	ldr		r4, [r1], #4
-	ldr		r5, [r1], #4
-	ldrd    r6, r7, [r0]
-	toBitInterleaving	r4, r5, r6, r7, r3, 0
-	strd	r6, r7, [r0], #8
-	subs	r2, r2, #1
-	bne		__KeccakF1600_StateXORLanes_LoopAligned
-	bx		lr
-
-
-@----------------------------------------------------------------------------
-@
-@ __KeccakF1600_StateXORBytesInLane
-@
-@ Input:
-@  r0 state pointer
-@  r1 data pointer
-@  r2 offset in lane
-@  r3 length
-@
-@ Output:
-@  r0 state pointer next lane
-@  r1 data pointer next byte to input
-@
-@  Changed: r2-r7
-@
-.align 8
-__KeccakF1600_StateXORBytesInLane:
-	movs	r4, #0
-	movs	r5, #0
-	push	{ r4 - r5 }
-	add		r2, r2, sp
-__KeccakF1600_StateXORBytesInLane_Loop:
-	ldrb	r5, [r1], #1
-	strb	r5, [r2], #1
-	subs	r3, r3, #1
-	bne		__KeccakF1600_StateXORBytesInLane_Loop
-	pop		{ r4 - r5 }
-	ldrd    r6, r7, [r0]
-	toBitInterleaving	r4, r5, r6, r7, r3, 0
-	strd	r6, r7, [r0], #8
-	bx		lr
-
-
-
-
-@----------------------------------------------------------------------------
-@
-@ void KeccakF1600_StateExtractBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
-@
-.align 8
-.global   KeccakF1600_StateExtractBytes
-KeccakF1600_StateExtractBytes:
-	cbz		r3, KeccakF1600_StateExtractBytes_Exit1	@ .if length != 0
-	push	{r4 - r8, lr}							@ then
-	bic		r4, r2, #7								@ offset &= ~7
-	adds	r0, r0, r4								@ add whole lane offset to state pointer
-	ands	r2, r2, #7								@ offset &= 7 (part not lane aligned)
-	beq		KeccakF1600_StateExtractBytes_CheckLanes	@ .if offset != 0
-	movs	r4, r3									@ then, do remaining bytes in first lane
-	rsb		r5, r2, #8								@ max size in lane = 8 - offset
-	cmp		r4, r5
-	ble		KeccakF1600_StateExtractBytes_BytesAlign
-	movs	r4, r5
-KeccakF1600_StateExtractBytes_BytesAlign:
-	sub		r8, r3, r4								@ size left
-	movs	r3, r4
-	bl		__KeccakF1600_StateExtractBytesInLane
-	mov		r3, r8
-KeccakF1600_StateExtractBytes_CheckLanes:
-	lsrs	r2, r3, #3								@ .if length >= 8
-	beq		KeccakF1600_StateExtractBytes_Bytes
-	mov		r8, r3
-	bl		__KeccakF1600_StateExtractLanes
-	and		r3, r8, #7
-KeccakF1600_StateExtractBytes_Bytes:
-	cbz		r3, KeccakF1600_StateExtractBytes_Exit
-	movs	r2, #0
-	bl		__KeccakF1600_StateExtractBytesInLane
-KeccakF1600_StateExtractBytes_Exit:
-	pop		{r4 - r8, pc}
-KeccakF1600_StateExtractBytes_Exit1:
-	bx		lr
-
-
-@----------------------------------------------------------------------------
-@
-@ __KeccakF1600_StateExtractLanes
-@
-@ Input:
-@  r0 state pointer
-@  r1 data pointer
-@  r2 laneCount
-@
-@ Output:
-@  r0 state pointer next lane
-@  r1 data pointer next byte to input
-@
-@ Changed: r2-r5
-@
-.align 8
-__KeccakF1600_StateExtractLanes:
-__KeccakF1600_StateExtractLanes_LoopAligned:
-	ldrd	r4, r5, [r0], #8
-	fromBitInterleaving	r4, r5, r3
-	str		r4, [r1], #4
-	subs	r2, r2, #1
-	str		r5, [r1], #4
-	bne		__KeccakF1600_StateExtractLanes_LoopAligned
-	bx		lr
-
-
-@----------------------------------------------------------------------------
-@
-@ __KeccakF1600_StateExtractBytesInLane
-@
-@ Input:
-@  r0 state pointer
-@  r1 data pointer
-@  r2 offset in lane
-@  r3 length
-@
-@ Output:
-@  r0 state pointer next lane
-@  r1 data pointer next byte to input
-@
-@  Changed: r2-r6
-@
-.align 8
-__KeccakF1600_StateExtractBytesInLane:
-	ldrd	r4, r5, [r0], #8
-	fromBitInterleaving	r4, r5, r6
-	push	{r4, r5}
-	add		r2, sp, r2
-__KeccakF1600_StateExtractBytesInLane_Loop:
-	ldrb	r4, [r2], #1
-	subs	r3, r3, #1
-	strb	r4, [r1], #1
-	bne		__KeccakF1600_StateExtractBytesInLane_Loop
-	add		sp, #8
-	bx		lr
-
-
-
-.align 8
-KeccakF1600_StatePermute_RoundConstantsWithTerminator:
-	@		0			1
-		.long 		0x00000001,	0x00000000
-		.long 		0x00000000,	0x00000089
-		.long 		0x00000000,	0x8000008b
-		.long 		0x00000000,	0x80008080
-
-		.long 		0x00000001,	0x0000008b
-		.long 		0x00000001,	0x00008000
-		.long 		0x00000001,	0x80008088
-		.long 		0x00000001,	0x80000082
-
-		.long 		0x00000000,	0x0000000b
-		.long 		0x00000000,	0x0000000a
-		.long 		0x00000001,	0x00008082
-		.long 		0x00000000,	0x00008003
-
-		.long 		0x00000001,	0x0000808b
-		.long 		0x00000001,	0x8000000b
-		.long 		0x00000001,	0x8000008a
-		.long 		0x00000001,	0x80000081
-
-		.long 		0x00000000,	0x80000081
-		.long 		0x00000000,	0x80000008
-		.long 		0x00000000,	0x00000083
-		.long 		0x00000000,	0x80008003
-
-		.long 		0x00000001,	0x80008088
-		.long 		0x00000000,	0x80000088
-		.long 		0x00000001,	0x00008000
-		.long 		0x00000000,	0x80008082
-
-		.long 		0x000000FF	@terminator
-
-@----------------------------------------------------------------------------
-@
-@ void KeccakF1600_StatePermute( void *state )
-@
-.align 8
-.global   KeccakF1600_StatePermute
-KeccakF1600_StatePermute:
-	adr		r1, KeccakF1600_StatePermute_RoundConstantsWithTerminator
-	push	{ r4 - r12, lr }
-	sub		sp, #mSize
-	str		r1, [sp, #mRC]
-KeccakF1600_StatePermute_RoundLoop:
-	KeccakRound0
-	KeccakRound1
-	KeccakRound2
-	KeccakRound3
-	bne		KeccakF1600_StatePermute_RoundLoop
-	add		sp, #mSize
-	pop		{ r4 - r12, pc }
-
diff --git a/src/pqm4common/keccakf1600.h b/src/pqm4common/keccakf1600.h
deleted file mode 100644
index e017bf1..0000000
--- a/src/pqm4common/keccakf1600.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef KECCAKF1600_H
-#define KECCAKF1600_H
-
-#include <stdint.h>
-
-void KeccakF1600_StateExtractBytes(uint64_t *state, unsigned char *data, unsigned int offset, unsigned int length);
-void KeccakF1600_StateXORBytes(uint64_t *state, const unsigned char *data, unsigned int offset, unsigned int length);
-void KeccakF1600_StatePermute(uint64_t * state);
-
-#endif
\ No newline at end of file
diff --git a/src/rng.c b/src/rng.c
index 79f455c..68a8a15 100644
--- a/src/rng.c
+++ b/src/rng.c
@@ -1,5 +1,4 @@
 #include "rng.h"
-#include <string.h>
 
 void RNG_Enable(void) {
     RCC_AHB2PeriphClockCmd(RCC_AHB2Periph_RNG, ENABLE);
@@ -10,31 +9,3 @@ void RNG_Enable(void) {
 void RNG_Disable(void) {
     RCC_AHB2PeriphClockCmd(RCC_AHB2Periph_RNG, DISABLE);
 }
-
-static uint32_t rng_get_random_internal() {
-    while (RNG_GetFlagStatus(RNG_FLAG_DRDY) == RESET){}
-    return RNG_GetRandomNumber();
-}
-
-static uint32_t rng_get_random_blocking() {
-    uint32_t bytes;
-    RNG_Enable();
-    bytes = rng_get_random_internal();
-    RNG_Disable();
-}
-
-void randombytes(char* data, size_t size) {
-    uint32_t randomness;
-    RNG_Enable();
-    while (size >= sizeof(uint32_t)) {
-        randomness = rng_get_random_internal();
-        memcpy(data, &randomness, sizeof(uint32_t));
-        size -= sizeof(uint32_t);
-        data += sizeof(uint32_t);
-    }
-    if (size > 0) {
-        randomness = rng_get_random_internal();
-        memcpy(data, &randomness, size);
-    }
-    RNG_Disable();
-}
diff --git a/src/rng.h b/src/rng.h
index beae3be..1c8d5e9 100644
--- a/src/rng.h
+++ b/src/rng.h
@@ -7,6 +7,5 @@
 
 void RNG_Enable(void);
 void RNG_Disable(void);
-void randombytes(char* data, size_t size);
 
 #endif //PINATABOARD_RNG_H