You can group the items by first letter and just search the sublists, no string can start with a substring unless it has at least the same first letter:
from collections import defaultdict
def find(l):
    """Yield each word in *l* that no *other* word in *l* is a prefix of.

    Duplicates are kept: a word is never treated as a prefix of itself,
    and the ``word != other`` guard only excludes the identical string.
    """
    buckets = defaultdict(list)
    # A prefix necessarily shares its first letter with the word it
    # prefixes, so we only ever need to compare words within one bucket.
    for word in l:
        buckets[word[0]].append(word)
    for members in buckets.values():
        for word in members:
            # Keep the word unless some distinct member is a proper prefix.
            if all(word == other or not word.startswith(other) for other in members):
                yield word
print(list(find(l)))
['sdfdg', 'xc', 'ab']
This correctly filters the words. As you can see from the output below, the reduce-based function does not — 'tool'
should not be in the output:
In [56]: l = ["tool",'ab',"too", 'xc', 'abb',"toot", 'abed',"abel", 'sdfdg', 'abfdsdg', 'xccc',"xcew","xrew"]
In [57]: reduce(r,l)
Out[57]: ['tool', 'ab', 'too', 'xc', 'sdfdg', 'xrew']
In [58]: list(find(l))
Out[58]: ['sdfdg', 'too', 'xc', 'xrew', 'ab']
It also does it efficiently:
In [59]: l = ["".join(sample(ascii_lowercase, randint(2,25))) for _ in range(5000)]
In [60]: timeit reduce(r,l)
1 loops, best of 3: 2.12 s per loop
In [61]: timeit list(find(l))
1 loops, best of 3: 203 ms per loop
In [66]: %%timeit
..... result = []
....: for element in lst:
....: is_prefixed = False
....: for possible_prefix in lst:
....: if element is not possible_prefix and element.startswith(possible_prefix):
....: is_prefixed = True
....: break
....: if not is_prefixed:
....: result.append(element)
....:
1 loops, best of 3: 4.39 s per loop
In [92]: timeit list(my_filter(l))
1 loops, best of 3: 2.94 s per loop
If you know the minimum string length is always > 1 you can optimise further: if the minimum string length is 2,
then for one word to be a prefix of another, the two words must share at least their first two letters:
def find(l):
    """Yield the words in *l* that no other word in *l* is a proper prefix of.

    Groups words on their first ``width`` characters, where ``width`` is the
    length of the shortest word: any prefix is at least that long, so a word
    and its prefix always land in the same group.
    """
    # shortest word length determines the grouping key width
    width = len(min(l, key=len))
    buckets = defaultdict(list)
    for word in l:
        buckets[word[:width]].append(word)
    for members in buckets.values():
        for word in members:
            if all(word == other or not word.startswith(other) for other in members):
                yield word
In [84]: timeit list(find(l))
100 loops, best of 3: 14.6 ms per loop
Lastly if you have dupes you may want to filter out the duplicated words from your list but you need to keep them to compare:
from collections import defaultdict,Counter
def find(l):
    """Yield words that are unique in *l* and not properly prefixed by another word.

    Unlike the earlier versions, duplicated words are dropped entirely:
    a global Counter over *l* tracks how many times each word appears.
    """
    counts = Counter(l)
    width = len(min(l, key=len))
    buckets = defaultdict(list)
    for word in l:
        buckets[word[:width]].append(word)
    for members in buckets.values():
        for word in members:
            if counts[word] != 1:
                # duplicated word — filtered out entirely
                continue
            if all(word == other or not word.startswith(other) for other in members):
                yield word
So if speed is important, an implementation using some variation of the code above is going to be significantly faster than your naive approach. There is also more data stored in memory, so you should take that into account too.
To save memory we can create a counter for each val/sublist so we only store a single counter dict of words at a time:
def find(l):
    """Yield unique, un-prefixed words from *l*, counting duplicates per bucket.

    Memory-friendlier variant: duplicates necessarily share a grouping key,
    so one Counter per bucket (alive only for that iteration) is enough —
    we never hold a count of the whole word list at once.
    """
    width = len(min(l, key=len))
    buckets = defaultdict(list)
    for word in l:
        buckets[word[:width]].append(word)
    for members in buckets.values():
        # only this bucket needs checking for duplicates
        counts = Counter(members)
        for word in members:
            prefixed = any(word.startswith(other) and word != other for other in members)
            if not prefixed and counts[word] == 1:
                yield word
creating a dict each loop adds 5ms so still < 20ms for 5k words.
The reduce method should work if the data is sorted:
reduce(r,sorted(l)) # -> ['ab', 'sdfdg', 'too', 'xc', 'xrew']
To make the difference clear between the behaviour:
In [202]: l = ["tool",'ab',"too", 'xc', 'abb',"toot", 'abed',
"abel", 'sdfdg', 'abfdsdg', 'xccc',"xcew","xrew","ab"]
In [203]: list(filter_list(l))
Out[203]: ['ab', 'too', 'xc', 'sdfdg', 'xrew', 'ab']
In [204]: list(find(l))
Out[204]: ['sdfdg', 'too', 'xc', 'xrew', 'ab', 'ab']
In [205]: reduce(r,sorted(l))
Out[205]: ['ab', 'sdfdg', 'too', 'xc', 'xrew']
In [206]: list(find_dupe(l))
Out[206]: ['too', 'xrew', 'xc', 'sdfdg']
In [207]: list(my_filter(l))
Out[207]: ['sdfdg', 'xrew', 'too', 'xc']
In [208]: "ab".startswith("ab")
Out[208]: True
So ab
is repeated twice; using a set or a dict without keeping track of how many times ab
appeared would mean we would consider that no other element satisfied the condition
"ab".startswith(other) == True
, which we can see is incorrect.
You can also use itertools.groupby to group based on the min index size:
def find_dupe(l):
l.sort()
mn = len(min(l, key=len))
for k, val in groupby(l, key=lambda x: x[:mn]):
val = list(val)
for v in val:
cn = Counter(val)
if not any(v.startswith(s) and v != s for s in val) and cn[v] == 1:
yield v
Based on your comments, we can adjust my first code if you don't think "dd".startswith("dd")
should be True with repeated elements:
l = ['abbb', 'xc', 'abb', 'abed', 'sdfdg', 'xc','abfdsdg', 'xccc', 'd','dd','sdfdg', 'xc','abfdsdg', 'xccc', 'd','dd']
def find_with_dupe(l):
    """Deduplicate *l*, then yield the words not properly prefixed by any other word.

    Because the input is first passed through ``set``, repeated words collapse
    to one copy and so are compared against the other *distinct* words only.
    """
    d = defaultdict(list)
    srt = sorted(set(l))
    # BUG FIX: the original used len(srt[0]) — the length of the
    # *lexicographically* smallest word, not the shortest one. When those
    # differ, words are grouped on an over-long key and can land in a
    # different bucket from their own prefix, escaping the filter.
    ind = len(min(srt, key=len))
    # group by the first `ind` characters; a word and any prefix of it
    # always share this key, since every prefix has length >= ind
    for ele in srt:
        d[ele[:ind]].append(ele)
    for val in d.values():
        for v in val:
            # check each candidate against the other words in its bucket
            if not any(v.startswith(s) and v != s for s in val):
                yield v
print(list(find_with_dupe(l)))
['abfdsdg', 'abed', 'abb', 'd', 'sdfdg', 'xc']
Which, run on a random sample of text, completes in a fraction of the time your own code does:
In [15]: l = open("/home/padraic/Downloads/sample.txt").read().split()
In [16]: timeit list(find(l))
100 loops, best of 3: 19 ms per loop
In [17]: %%timeit
....: l = open("/home/padraic/Downloads/sample.txt").read().split()
....: for i in range(0, len(l) - 1):
....: for j in range(i + 1, len(l)):
....: if l[j].startswith(l[i]):
....: l[j] = l[i]
....: else:
....: if l[i].startswith(l[j]):
....: l[i] = l[j]
....:
1 loops, best of 3: 4.92 s per loop
Both returning identical output:
In [41]: l = open("/home/padraic/Downloads/sample.txt").read().split()
In [42]:
for i in range(0, len(l) - 1):
for j in range(i + 1, len(l)):
if l[j].startswith(l[i]):
l[j] = l[i]
else:
if l[i].startswith(l[j]):
l[i] = l[j]
....:
In [43]:
In [43]: l2 = open("/home/padraic/Downloads/sample.txt").read().split()
In [44]: sorted(set(l)) == sorted(find(l2))
Out[44]: True