Title: Given n sentences, each no longer than a maximum length m, merge adjacent short sentences so that the number of resulting sentences is as small as possible, no merged sentence is longer than m, and the variance of the merged sentence lengths is minimal. Find such a merge.
Solution (we came up with this problem ourselves; of the three approaches below, sent_comb3 produces a segmentation that meets the requirements):
# -*- coding: utf-8 -*-
"""Merge adjacent sentence lengths into groups of at most ``max_len``.

Three heuristics are provided. Each takes a sequence of sentence lengths and
returns a list of groups (lists of lengths) that, concatenated in order,
reproduce the input. The heuristics differ in how they try to keep the group
sums close to each other.
"""
import timeit

import numpy as np


def _target_len(total, max_len):
    """Return ``total`` split evenly over ``ceil(total / max_len)`` groups.

    Returns 0 when ``total`` is 0 so that callers never divide by zero.
    """
    n_groups = int(np.ceil(total / max_len))
    return total // n_groups if n_groups else 0


def sent_comb1(sent_lens, max_len=35):
    """Group sentences greedily against one global target length.

    The target is the total length divided by the smallest number of groups
    that could hold it. A group is closed as soon as its sum reaches the
    target, or when the next sentence would push it past ``max_len``.

    Args:
        sent_lens: Sentence lengths, in order (list or 1-D array of ints).
        max_len: Maximum allowed sum of a group.

    Returns:
        A list of groups; each group is a list of consecutive lengths.
        Empty input yields ``[[]]``.
    """
    total = sum(sent_lens)
    if total <= max_len:
        return [list(sent_lens)]

    avg_len = _target_len(total, max_len)
    rlts = []
    for sent_len in sent_lens:
        if not rlts or sum(rlts[-1]) >= avg_len:
            # First sentence, or the current group already reached the target.
            rlts.append([sent_len])
        elif sum(rlts[-1]) + sent_len <= max_len:
            rlts[-1].append(sent_len)
        else:
            rlts.append([sent_len])
    return rlts


def sent_comb2(sent_lens, max_len=35):
    """Group sentences greedily against a target recomputed for every group.

    Before each group is built, the target is derived from the longest prefix
    of the remaining sentences whose sum does not exceed ``2 * max_len``.
    The group is closed when it reaches that target or when the next sentence
    would push it past ``max_len``.

    A sentence that is itself longer than ``max_len`` is placed alone at the
    start of a group instead of being rejected, so the function always makes
    progress.

    Args:
        sent_lens: Sentence lengths, in order (list or 1-D array of ints).
        max_len: Maximum allowed sum of a group.

    Returns:
        A list of groups; each group is a list of consecutive lengths.
        Empty input yields ``[[]]``.
    """
    lens = list(sent_lens)
    remaining = sum(lens)
    rlts = []
    start = 0

    while start < len(lens) and remaining > max_len:
        # Look-ahead window: longest prefix of the remaining sentences whose
        # sum stays within two full groups.
        window_sum = 0
        idx = start
        while idx < len(lens) and window_sum + lens[idx] <= 2 * max_len:
            window_sum += lens[idx]
            idx += 1
        avg_len = _target_len(window_sum, max_len)

        rlt = []
        rlt_sum = 0
        while start < len(lens):
            sent_len = lens[start]
            # An empty group always accepts the next sentence; otherwise a
            # sentence that cannot fit anywhere would stall the loop.
            if rlt and (rlt_sum >= avg_len or rlt_sum + sent_len > max_len):
                break
            rlt.append(sent_len)
            rlt_sum += sent_len
            start += 1
        rlts.append(rlt)
        remaining -= rlt_sum

    # Whatever is left fits in a single group. ``not rlts`` keeps the
    # ``[[]]`` result for empty input.
    if start < len(lens) or not rlts:
        rlts.append(lens[start:])
    return rlts


def _split_pair(pool, max_len):
    """Split ``pool`` into a front part near half its sum, and the rest.

    The front part grows until its sum reaches ``sum(pool) // 2`` or the next
    item would push it past ``max_len``. A first item longer than ``max_len``
    is still accepted so that the front part is not left empty because of it.
    """
    avg_len = sum(pool) // 2
    cut = 0
    front_sum = 0
    for sent_len in pool:
        if front_sum >= avg_len:
            break
        if cut and front_sum + sent_len > max_len:
            break
        front_sum += sent_len
        cut += 1
    return pool[:cut], pool[cut:]


def sent_comb3(sent_lens, max_len=35):
    """Pack greedily, then rebalance neighbouring groups until stable.

    Step 1 fills each group as far as ``max_len`` allows. Step 2 walks the
    groups left to right, pools each group with its right neighbour and
    re-splits the pool near its midpoint. Step 2 is repeated until a full
    pass leaves the grouping unchanged.

    Args:
        sent_lens: Sentence lengths, in order (list or 1-D array of ints).
        max_len: Maximum allowed sum of a group.

    Returns:
        A list of groups; each group is a list of consecutive lengths.
        Empty input yields ``[[]]``.
    """
    if sum(sent_lens) <= max_len:
        return [list(sent_lens)]

    # Step 1: greedy packing.
    rlts = []
    for sent_len in sent_lens:
        if rlts and sum(rlts[-1]) + sent_len <= max_len:
            rlts[-1].append(sent_len)
        else:
            rlts.append([sent_len])

    # Step 2: pairwise rebalancing until a pass changes nothing.
    while True:
        new_rlts = [rlts[0]]
        for group in rlts[1:]:
            front, rest = _split_pair(new_rlts[-1] + group, max_len)
            new_rlts[-1] = front
            new_rlts.append(rest)
        if new_rlts == rlts:
            return rlts
        rlts = new_rlts


def main():
    """Run the three heuristics on a sample input; print timing and result."""
    max_len = 35
    # Alternative inputs:
    # sent_lens = np.random.randint(1, max_len, size=20)
    # sent_lens = [34, 1, 34, 1, 30, 1]
    sent_lens = [7, 11, 12, 31, 1, 1, 26, 2, 7, 22,
                 1, 14, 28, 1, 1, 34, 24, 32, 10, 31]
    print(sent_lens)

    for combine in (sent_comb1, sent_comb2, sent_comb3):
        start_time = timeit.default_timer()
        comb_rlt = combine(sent_lens, max_len=max_len)
        print(timeit.default_timer() - start_time)
        print(comb_rlt)
        # print([sum(comb) for comb in comb_rlt])


if __name__ == "__main__":
    main()