Fastest unique finder

Question

Given 4096 16-bit integers, only four of which are unique and others appeared twice (so 2050 different integers exist). Find the four uniques.

To time precisely and limit RAM usage, your program will be run on a 12MHz 8051(with 128B RAM). I'll test with a random case generated by some code to test, but we aim at the worst case, so anyone can provide other test cases.

You can answer in C or ASM, but C will likely give some extra disadvantage. The input array is placed on ROM 0xE000-0xFFFF, and output is placed on extra write-only RAM 0x00-0x07. When you finish outputting, clear P3.5.

A C code that solves the problem with bad score, to better understand the I/O way:

#include <reg51.h>
sbit P3_5 = P3^5;
unsigned int code input[4096] _at_ 0xE000;
unsigned int pdata output[4] _at_ 0x00;
void main() {
    unsigned long i;
    unsigned int j;
    unsigned char cnt, wr = 0;
    for (i=0; i<65536L; ++i) {
        cnt = 0;
        for (j=0; j<4096; ++j) {
            if (input[j] == i) ++cnt;
        }
        if (cnt==1) { // Order doesn't matter
            output[wr++] = i;     
        }
    }
    P3_5 = 0;
    while (1);
}

According to test this runs for 04:19:36.997507. Also see the example ASM solution that solve in several seconds.

For extreme push, I'll run the simulation on Proteus with such circuit: and check the time when DBT1 is actived.

You can use all ROM in 0000-DFFF.
You can assume RAM is initialized to 0.
You can use SFRs, including pins, as RAM
You don't need to jmp $ after clearing P3.5
You can write trash into outer RAM 08-FF, they'll be ignored

with 128B RAM - Is stack memory included in the 128 bytes? Is the code loaded into the 128 bytes? — Digital Trauma, Commented Apr 14, 2021 at 14:36
@StackMeter the question says You can answer in C or ASM, but if you have a compiler for any other language for the described 8051 target, then I would assume that would also be acceptable. — Digital Trauma, Commented Apr 14, 2021 at 16:12
Here's an idea, if anyone wants to develop it. By xor-ing all the numbers, we can find the xor of the four unique numbers in very little space. Similarly, we can apply any function to the four of them and get the xor of the results. One idea is use xor multiplcation, thinking of the numbers as polynomials with coefficients mod 2. I think that xor-summing the first, second, third, and fourth powers of the numbers as polynomials should let you recover the four distinct values from the system of four equations. — xnor, Commented Apr 14, 2021 at 18:40

tsh · Accepted Answer · 2021-04-16 07:12:15Z

C, 0.480298s

Codes

#ifndef LOCAL_DEBUG
#include <reg51.h>
sbit P3_5 = P3 ^ 5;
unsigned int code input[4096] _at_ 0xE000;
unsigned int pdata output[4] _at_ 0x00;
#else

#include <stdio.h>
#define int short

#define UNIQ1 17
#define UNIQ2 273
#define UNIQ3 1298
#define UNIQ4 1554

unsigned int input[4096] = {
UNIQ1,
UNIQ2,
UNIQ3,
UNIQ4
};
unsigned int output[4];

#endif

#define INPUT_SIZE 4096
#define BUCKET_BITS 4
#define BUCKET_SIZE (1 << (BUCKET_BITS))
#define BUCKET_MASK ((BUCKET_SIZE) - 1)

unsigned int buckets[BUCKET_SIZE];

#define buckets_value(n) (buckets[n] & ~(bucket_mask)) | ((n) << bit_offset)

void main() {
  unsigned int output_offset = 0;
  unsigned int i, j, k, l;
  unsigned int search_bucket = 0, search_bucket_2 = 0, bit_offset = 0, bit_offset_2, bucket_mask;
  unsigned int target_mask, target_value;
  do {
    bucket_mask = BUCKET_MASK << bit_offset;
    for (i = 0; i < BUCKET_SIZE; ++i) {
      buckets[i] = 0;
    }
    for (i = 0; i < INPUT_SIZE; ++i) {
      buckets[(input[i] >> bit_offset) & BUCKET_MASK] ^= input[i] | bucket_mask;
    }
    for (i = j = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] != 0) ++j;
    }
    if (j <= 1) {
      bit_offset += BUCKET_BITS;
    }
  } while (j <= 1);
  if (j == 4) {
    for (i = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] == 0) continue;
      output[output_offset++] = buckets_value(i);
    }
  } else if (j == 3) {
    for (i = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] == 0) continue;
      if (buckets[i] & (BUCKET_MASK << bit_offset)) {
        output[output_offset++] = buckets_value(i);
      } else {
        search_bucket = i << bit_offset;
      }
    }
  } else if (j == 2) {
    for (i = k = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] == 0) continue;
      if (buckets[i] & (BUCKET_MASK << bit_offset)) {
        l = buckets_value(i);
        if (k == 0) {
          for (j = 0; j < INPUT_SIZE; ++j) {
            if (input[j] == l) ++k;
          }
          if (k == 1) {
            output[output_offset++] = l;
          } else {
            search_bucket = i << bit_offset;
            k = 2;
          }
        } else if (k == 1) {
          search_bucket = i;
        } else {
          output[output_offset++] = l;
        }
      } else {
        if (search_bucket) {
          search_bucket_2 = i << bit_offset;
        } else {
          search_bucket = i << bit_offset;
        }
      }
    }
  }
  bit_offset_2 = bit_offset;
  target_mask = BUCKET_MASK << bit_offset;
  target_value = search_bucket;
  while (output_offset < 4) {
    bit_offset += BUCKET_BITS;
    bucket_mask = BUCKET_MASK << bit_offset;
    for (i = 0; i < BUCKET_SIZE; ++i) {
      buckets[i] = 0;
    }
    for (i = 0; i < INPUT_SIZE; ++i) {
      if ((input[i] & target_mask) != target_value) continue;
      buckets[(input[i] >> bit_offset) & BUCKET_MASK] ^= input[i] | bucket_mask;
    }
    for (i = j = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] != 0) ++j;
    }
    if (j == 1) {
      continue;
    }
    for (i = 0; i < BUCKET_SIZE; ++i) {
      if (buckets[i] == 0) continue;
      if ((buckets[i] >> bit_offset) & BUCKET_MASK) {
        output[output_offset++] = buckets_value(i);
        --j;
      } else {
        target_mask |= bucket_mask;
        target_value |= i << bit_offset;
      }
    }
    if (j == 0) {
      bit_offset = bit_offset_2;
      target_mask = BUCKET_MASK << bit_offset;
      target_value = search_bucket_2;
    }
  }
#ifndef LOCAL_DEBUG
  P3_5 = 0;
  while (1);
#else
  for (i = 0; i < 4; i++) {
    printf("%d\n", output[i]);
  }
#endif
}

Try it online!

I'm not sure about its correctness. Please comment if you find out some failed testcases. I will try my best to fix it.

Algorithm

Algorithm here only works when input array contains 4 unique values. It will fail if input array contains more than 4 values.

Consider xor of some numbers where most numbers occurred 2 times, and some occurred once. It is easy to prove: The xor result is zero if and only if we have at least 3 unique numbers here. Also, we try to count how many numbers at all. It is trivial to know: If the count is even / odd, then we have even / odd number of unique numbers here.

As there are only 4 unique numbers according to the description of this question. If both xor result is zero and count result is even, there should be 0 or 4 unique numbers.

By using this idea, we need to split numbers into some buckets while keep equals number in same buckets. You can do it by (input[i] >> n) & m. So we first split all values input[i] into 16 buckets based on (input[i] >> 0) & 15. We use input[i] | bucket_mask to mask out the bits for bucket name. We set them to 1s, so they are used to know count number in that buckets is odd or even.

for (i = 0; i < INPUT_SIZE; ++i) {
  buckets[(input[i] >> bit_offset) & BUCKET_MASK] ^= input[i] | bucket_mask;
}

We use above code. And after calculate these buckets:

If buckets[i] & bucket_mask is non-zero: we know the bucket i contains 1 or 3 unique numbers (number count is odd).
If buckets[i] & bucket_mask is zero, but buckets[i] is non-zero: we know the bucket i contains 2 unique numbers.
If buckets[i] is zero, and some other bucket bucket[j] is non-zero: we know the bucket i contains no unique numbers.
If all buckets are zero: 4 unique numbers are located in the same bucket. We need to try another 4 bits as mask and run the loop again.

After we know where the number is in some bucket:

If the bucket contains only 1 unique numbers: the unique number is (buckets[i] & ~(bucket_mask)) | ((i) << bit_offset);
If the bucket contains more than 1 unique numbers: We loop again on numbers in this bucket, and split them into different buckets based on other bits;

Analysis

Time complexity is \$O\left(size \cdot bit / buckets + 2^{buckets} \right)\$. \$size\$ is how many numbers the input contains. \$bit\$ is how many bits each input contains. \$bucket\$ is how many bits for bucket id (4 in above codes).

Extra storage complexity is \$O\left(2^{buckets}\right)\$.

\$\begingroup\$ Runtime 0.480298s \$\endgroup\$
– l4m2
Commented Apr 15, 2021 at 15:46 — l4m2, Commented Apr 15, 2021 at 15:46

l4m2 · Accepted Answer · 2021-04-14 01:07:36Z

5.003017s

    PAG equ R7
    APAG equ 7
    WRPTR equ R0

    org 0
    ; mov PAG, a  ; using that RAM is cleared
    ; mov WRPTR, a
mainloop:
    mov dptr, #0E000h
    ; clr 64-127
    clr a
    _ptr set 64
    rept 64
        mov _ptr, a
        _ptr set _ptr+1
    endm
scan:   
    rept 128
        local exit
        clr a
        movc a, @a+dptr
        inc dpl
        add a, PAG
        add a, acc
        jnz exit
        clr a
        movc a, @a+dptr
        mov b, a
        orl b, #0D8h
        clr b.5
        mov b.2, c  ; 11011caa
        anl a, #0FCh
        mov 9, #exit/256
        mov 8, #exit mod 256
        mov 11, b
        mov 10, a
        mov sp, #11
        ret
exit:   ;inc dptr
        inc dpl
    endm
    inc dph
    mov a, dph
    jz not_scan
    jmp scan
not_scan:
    _ptr set 0
    xrl APAG, #0FFh
    inc PAG
    rept 64
        local j7, j6, j5, j4, j3, j2, j1, j0
        mov b, _ptr/8+64
        jnb b.7, j7
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+7
        movx @WRPTR, a
        inc WRPTR
j7:     jnb b.6, j6
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+6
        movx @WRPTR, a
        inc WRPTR
j6:     jnb b.5, j5
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+5
        movx @WRPTR, a
        inc WRPTR
j5:     jnb b.4, j4
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+4
        movx @WRPTR, a
        inc WRPTR
j4:     jnb b.3, j3
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+3
        movx @WRPTR, a
        inc WRPTR
j3:     jnb b.2, j2
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+2
        movx @WRPTR, a
        inc WRPTR
j2:     jnb b.1, j1
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+1
        movx @WRPTR, a
        inc WRPTR
j1:     jnb b.0, j0
        mov a, PAG
        movx @WRPTR, a
        inc WRPTR
        mov a, #_ptr mod 256+0
        movx @WRPTR, a
        inc WRPTR
j0:     
        _ptr set _ptr + 8
        if (_ptr = 256)
            orl APAG, #080h
        endif
    endm
    inc PAG
    cjne PAG, #0, mainloopJ
    clr P3.5
    jmp $
mainloopJ:      
    xrl APAG, #07Fh
    inc PAG
    jmp mainloop

    org 0D800h
    rept 512
        _oa set (03FCh and $) mod 255 + ($>=0DC00h)*256
        xrl 64+_oa/8, #1 shl (_oa mod 8)
        ret
    endm

I know SFRs are enough to make 1024 tested numbers in a scan, but as this is an example solution, and I expect some much faster solutions than bit[N]&scan

Digital Trauma · Accepted Answer · 2021-04-14 18:25:46Z

C, 87s

I don't have the means to test or even compile for the environment, so there are likely issues and further optimisations. I've no idea if the code (and stack?) will fit in 128B.

There are a couple of optimisations over the naïve C example:

4096x4096 iterations instead of 65536x4096 iterations.
XOR trick - XOR all the input values together. Then once we've found 3 out of 4 non-duplicate values, these may be XORed against the xor accumulator one more time to cancel them out and leave the 4th non-duplicate value remaining.

There are likely better algorithms than this, but it's a start.

Hand-written assembly will certainly beat this, but for now, I don't know 8051 assembly at all, so I'll have to settle for C.

I am assuming in this architecture, sizeof(int) == 2 and sizeof(long) == 4.

#include <reg51.h>
sbit P3_5 = P3^5;
unsigned int code input[4096] _at_ 0xE000;
unsigned int pdata output[4] _at_ 0x00;
void main() {
    unsigned int i, j;
    unsigned char dupfound, wr = 0;
    unsigned int xor_acum = 0;
    unsigned int v;
    unsigned int output_stage[3];

    for (i=0; i<4096; ++i) {
        v = input[i];
        xor_acum ^= v;
        if (wr < 3) {
            dupfound = 0;
            for (j=0; j<4096; ++j) {
                if (i != j && v == input[j]) {
                    dupfound = 1;
                    break;
                }
            }
            if (!dupfound) {
                output_stage[wr++] = v;
            }
        }
    }
    output[0] = output_stage[0];
    xor_acum ^= output_stage[0];
    output[1] = output_stage[1];
    xor_acum ^= output_stage[1];
    output[2] = output_stage[2];
    xor_acum ^= output_stage[2];
    output[3] = xor_acum;

    P3_5 = 0;
    while (1);
}

Thanks for testing. Is that 1min 26.6secs?. To be honest, I'm a little astonished this compiles, runs and presumably gives the right answer? Does it fit in the 128byte constraint? I certainly didn't expect a ~179x speedup compared to the naïve 4h 19m 37s - at best I thought in the 20x region. — Digital Trauma, Commented Apr 14, 2021 at 18:33
Get why you ~179x speedup. The third unique element is on position 1475, so you'll get worse case — l4m2, Commented Apr 15, 2021 at 1:49

Stack Exchange Network

Fastest unique finder

3 Answers 3

C, 0.480298s

Codes

Algorithm

Analysis

5.003017s

C, 87s

Not the answer you're looking for? Browse other questions tagged
fastest-code
restricted-memory
or ask your own question.

Linked

Hot Network Questions

Fastest unique finder

3 Answers 3

C, 0.480298s

Codes

Algorithm

Analysis

5.003017s

C, 87s

Not the answer you're looking for? Browse other questions tagged fastest-coderestricted-memory or ask your own question.

Linked

Related

Hot Network Questions

Not the answer you're looking for? Browse other questions tagged
fastest-code
restricted-memory
or ask your own question.