将核碱基表示从ASCII转换为UCSC.2bit-编程知识-白鹭情

明确的 DNA 序列仅由核碱基腺嘌呤 (A)、胞嘧啶 (C)、鸟嘌呤 (G)、胸腺嘧啶 (T) 组成。对于人类消费，碱基可以用相应char的大写或小写字母表示：A, C, G, T, 或a, c, g, t。然而，当需要存盘长序列时，这种表示是低效的。由于只需要存盘四个符号，因此可以为每个符号分配一个 2 位代码。UCSC指定的常用.2bit格式正是这样做的，使用以下编码： T = , C = , A = , G = 。0b000b010b100b11

下面的 C 代码显示了为清楚起见而撰写的参考实作。各种将基因组序列转换为char序列的开源软件通常使用一个 256 条目的查找表，这些查找表由每个char序列索引。这也与char. 然而，即使访问的是片上快取，存储器访问也非常昂贵，并且通用表查找很难 SIMDize。因此，如果可以通过简单的整数算术来完成转换，那将是有利的。鉴于 ASCII 是主要的char编码，人们可以限制这一点。

将作为 ASCII 字符给出的核碱基转换为它们的.2bit表示的有效计算方法是什么？

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   indeterminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

uj5u.com热心网友回复：

一种选择如下：

unsigned int ascii_to_2bit (unsigned int a)
{
    return ((0xad - a) & 6) >> 1;
}

这样做的好处是它只需要 8 位，不会溢位，并且不包含非常量移位，因此即使没有特定的 SIMD 指令，它也可以立即用于并行化，例如在 64 位中输入 8 个字符单词

unsigned int ascii_to_2bit_8bytes (uint64_t a)
{
    return ((0xadadadadadadadad - a) & 0x0606060606060606) >> 1;
}

将两个输出位留在每个字节的底部。

uj5u.com热心网友回复：

如果仔细观察核碱基的 ASCII 字符的二进制代码，很明显第 1 位和第 2 位提供了唯一的两位代码：A= 0b01000001-> 0b00、C= 0b01000011-> 0b01、G= 0b01000111-> 0b11、T= 0b01010100-> 0b10。与小写 ASCII 字符类似，它们仅在第 5 位上有所不同。不幸的是，这个简单的映射与.2bit-encoding不完全匹配，因为 A 和 T 的代码被交换了。解决此问题的一种方法是使用存盘在变量中的简单四项置换表，可能在优化后分配给暂存器（“暂存器内查找表”）：

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

另一种方法通过简单的位操作来纠正生成的“幼稚”代码，即通过简单的位操作实时提取位 1 和 2，通过观察位 1 为 0A和T，但 1 为Cand G。因此，我们可以通过将输入的第 1 位的倒数与初步代码的第 1 位进行异或运算来交换 A 和 T 的编码：

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}

这个版本在具有快速位域提取指令的处理器和没有桶形移位器的低端处理器上是有利的，因为只需要右移一位。在乱序处理器上，这种方法提供了比置换表更多的指令级并行性。适应 SIMD 实作似乎也更容易，因为在所有字节通道中使用相同的移位计数。

在我专注于相关 ASCII 字符的二进制编码之前，我已经研究过使用简单的数学计算。对小的乘数和除数进行简单的蛮力搜索得到：

unsigned int ascii_to_2bit_math (unsigned int a)
{
    return ((18 * (a & 31)) % 41) & 3;
}

乘法器18对没有快速整数乘法器的处理器很友好。现代编译器可以有效地处理具有编译时常数除数的模计算，并且不需要除法。即便如此，我注意到即使是最好的可用编译器也难以利用非常有限的输入和输出范围，所以我不得不手动按摩它以简化代码：

unsigned int ascii_to_2bit_math (unsigned int a)
{
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
}

即使在这种形式下并且假设单周期乘法的可用性，这似乎与之前的两种方法相比通常没有竞争力，因为它产生更多的指令和更长的依赖链。然而，在 64 位平台上，整个计算可以用一个 64 位、32 项的查找表代替，如果这个 64 位表可以有效地放入暂存器中，则可以提供具有竞争力的性能，x86 就是这种情况。 64，它作为立即数加载。

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}

我正在附加我的测验框架以供参考：

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define ORIGINAL_MATH (1)

unsigned int ascii_to_2bit_perm (unsigned int a)
{
    unsigned int perm = (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6);
    return (perm >> (a & 6)) & 3;
}

unsigned int ascii_to_2bit_twiddle (unsigned int a)
{
    return ((a >> 1) & 3) ^ (~a & 2);
}

unsigned int ascii_to_2bit_math (unsigned int a)
{
#if ORIGINAL_MATH
    return ((18 * (a & 31)) % 41) & 3;
#else // ORIGINAL_MATH
    unsigned int t = 18 * (a & 31);
    return (t - ((t * 25) >> 10)) & 3;
#endif // ORIGINAL_MATH
}

unsigned int ascii_to_2bit_tab (unsigned int a)
{
    uint64_t tab = ((0ULL <<  0) | (2ULL <<  2) | (0ULL <<  4) | (1ULL <<  6) |
                    (3ULL <<  8) | (0ULL << 10) | (2ULL << 12) | (3ULL << 14) |
                    (1ULL << 16) | (3ULL << 18) | (0ULL << 20) | (2ULL << 22) |
                    (3ULL << 24) | (1ULL << 26) | (2ULL << 28) | (0ULL << 30) |
                    (1ULL << 32) | (3ULL << 34) | (1ULL << 36) | (2ULL << 38) |
                    (0ULL << 40) | (1ULL << 42) | (3ULL << 44) | (0ULL << 46) |
                    (2ULL << 48) | (0ULL << 50) | (1ULL << 52) | (3ULL << 54) |
                    (0ULL << 56) | (2ULL << 58) | (3ULL << 60) | (1ULL << 62));
    return (tab >> (2 * (a & 31))) & 3;
}

/* Convert nucleobases A, C, G, T represented as either uppercase or lowercase
   ASCII characters into UCSC .2bit-presentation. Passing any values other than
   those corresponding to 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't' results in an
   inderminate response.
*/
unsigned int ascii_to_2bit (unsigned int a)
{
    const unsigned int UCSC_2BIT_A = 2;
    const unsigned int UCSC_2BIT_C = 1;
    const unsigned int UCSC_2BIT_G = 3;
    const unsigned int UCSC_2BIT_T = 0;

    switch (a) {
    case 'A':
    case 'a':
        return UCSC_2BIT_A;
        break;
    case 'C':
    case 'c':
        return UCSC_2BIT_C;
        break;
    case 'G':
    case 'g':
        return UCSC_2BIT_G;
        break;
    case 'T':
    case 't':
    default:
        return UCSC_2BIT_T;
        break;
    }
}

int main (void)
{
    char nucleobase[8] = {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'};
    printf ("Testing permutation variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i  ) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_perm (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing bit-twiddling variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i  ) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_twiddle (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing math-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i  ) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_math (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    printf ("Testing table-based variant:\n");
    for (unsigned int i = 0; i < sizeof nucleobase; i  ) {
        unsigned int ref = ascii_to_2bit (nucleobase[i]);
        unsigned int res = ascii_to_2bit_tab (nucleobase[i]);
        printf ("i=%2u %c res=%u ref=%u %c\n", 
                i, nucleobase[i], res, ref, (res == ref) ? 'p' : 'F');
    }
    return EXIT_SUCCESS;
}

uj5u.com热心网友回复：

以下例程将使用填充了uint32_t您宣告的字符的 ASCII 字符串的内容填充值阵列，并将保存状态以便能够将第二个、第三个等数量的字符串附加到阵列中。使用它的方式通过一个main()例程来说明，该例程从命令列获取字符串自变量并将它们传递给TOTAL元素阵列。该例程的自变量在其上方的注释中进行了描述。

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UCSC_2BIT_T  (0)
#define UCSC_2BIT_C  (1)
#define UCSC_2BIT_A  (2)
#define UCSC_2BIT_G  (3)

#define PAIRS_PER_INTEGER   16

/**
 * Converts a string of nucleobases in ASCII form into an array
 * of 32bit integers in 2bitform.  As it should be possible to
 * continue the array of integers, a status reference is
 * maintained in order to determine determine where in the
 * integer to start putting the two bit sequences.  For this,
 * a (*state) variable is maintained (initialize it to 0) to
 * remember where to start putting bitpairs in the array.
 *
 * @param state_ref reference to the state variable to be maintained
 *               with the position on which to put the base in the
 *               array entry.
 * @param dna    string with the ASCII chain of bases.
 * @param twobit array reference to start filling.
 * @param sz     size of the array ***in array entries***.
 * @return the number of array elements written.  This allows to
 *         use a pointer and advance it at each function call
 *         with the number of entries consumed on each call.
 */
ssize_t
dna2twobit(
    int *state_ref,
    char *dna,
    uint32_t twobit[],
    size_t sz)
{
    /* local copy so we only change the state in case of
     * successful execution */
    int state = 30 - *state_ref;
    if (state == 30) *twobit = 0;
    ssize_t total_nb = 0; /* total number of array elements consumed */
    while (*dna && sz) {
        char base = toupper(*dna  );
        uint32_t tb;
        switch (base) {
        case 'A': tb = UCSC_2BIT_A; break;
        case 'C': tb = UCSC_2BIT_C; break;
        case 'T': tb = UCSC_2BIT_T; break;
        case 'G': tb = UCSC_2BIT_G; break;
        default: return -1;
        }
        *twobit |= tb << state;
        state -= 2;
        if (state < 0) {
            --sz;   twobit;
            state  = 32;
            *twobit = 0;
            total_nb  ;
        }
    }
    *state_ref = 30 - state;
    return total_nb;
}

这个函式可以单独链接到你想要的任何程序中，但是我提供了一个main()代码来说明这个函式的使用。您可以使用命令列自变量中以 ASCII 编码的实际链来呼叫程序。该程序将在前一个之后对它们进行编码，以演示多重转换（16 个碱基适合一个 32 位整数，如您在问题中发布的格式的定义所述，因此如果没有编码 16 的倍数，状态反映了最后一个已经覆写了多少位。

代码继续下面的主要功能：


#define TOTAL 16

int main(int argc, char **argv)
{
    int state = 0, i;
    uint32_t twobit_array[TOTAL], *p = twobit_array;
    size_t twobit_cap = TOTAL;

    for (i = 1; i < argc;   i) {
        printf("Adding the following chain: [%s]\n", argv[i]);
        ssize_t n = dna2twobit(
                        &state,
                        argv[i],
                        p,
                        twobit_cap);
        if (n < 0) {
            fprintf(stderr,
                    "argument invalid: %s\n"
                    "continuing to next\n",
                    argv[i]);
            continue;
        }
        twobit_cap -= n;
        p  = n;
    }
    if (!state) --p;
    uint32_t *q = twobit_array;
    size_t off = 0;
    for (int j = 0; q <= p; j  ) {
        char *sep = "";
        printf("	zd: ", off);
        for (int k = 0; k < 4 & q <= p; k  ) {
            printf("%sx", sep, *q);
            sep = "-";
            q  ;
            off  = 16;
        }
        printf("\n");
    }
    return 0;
}