Array Segmentation (splice sentences)

Problem: there are n sentences, each of length at most m. Adjacent short sentences need to be spliced together so that the number of resulting sentences is as small as possible, every resulting sentence still has length no greater than m, and the variance of the resulting sentence lengths is as small as possible. Find such a splicing.

Solution (this is a problem we posed ourselves; the segmentation produced by sent_comb3 meets the requirements):

# -*- coding: utf-8 -*-

import numpy as np


def sent_comb1(sent_lens, max_len=35):
    """Group adjacent sentence lengths using one global target length.

    The target is the total length divided (floor) by the smallest number
    of groups that could hold that total, i.e. ``ceil(total / max_len)``.
    Sentences are then appended to the current group until the group has
    reached the target or the next sentence would push it past ``max_len``.

    Args:
        sent_lens: Sequence of sentence lengths, in document order.
        max_len: Upper bound for the summed length of one group.

    Returns:
        A list of groups; each group is a list of consecutive lengths from
        ``sent_lens``. If everything fits within ``max_len`` a single group
        holding all lengths is returned.
    """
    total = sum(sent_lens)
    if total <= max_len:
        return [list(sent_lens)]

    avg_len = total // np.ceil(total / max_len)

    groups = []
    current_sum = 0  # running sum of groups[-1], avoids re-summing each step
    for sent_len in sent_lens:
        fits_current = (
            groups
            and current_sum < avg_len
            and current_sum + sent_len <= max_len
        )
        if fits_current:
            groups[-1].append(sent_len)
            current_sum += sent_len
        else:
            # First sentence, target reached, or limit would be exceeded.
            groups.append([sent_len])
            current_sum = sent_len
    return groups


def sent_comb2(sent_lens, max_len=35):
    """Group adjacent sentence lengths using a locally computed target.

    For every new group, a look-ahead window is taken from the remaining
    sentences: the longest prefix whose sum does not exceed ``2 * max_len``.
    The target for the group is that window's sum divided (floor) by
    ``ceil(window_sum / max_len)``. Sentences are added to the group until
    the target is reached or the next one would exceed ``max_len``. As soon
    as everything that remains fits within ``max_len`` it becomes the final
    group.

    The work is done iteratively, so ``max_len`` applies to every group and
    the number of groups is not limited by the recursion depth. A sentence
    that is itself longer than ``max_len`` is placed alone in its own group.

    Args:
        sent_lens: Sequence of sentence lengths, in document order. It is
            not modified.
        max_len: Upper bound for the summed length of one group.

    Returns:
        A list of groups; each group is a list of consecutive lengths from
        ``sent_lens``. If everything fits within ``max_len`` a single group
        holding all lengths is returned.
    """
    lens = list(sent_lens)
    remaining_total = sum(lens)
    if remaining_total <= max_len:
        return [lens]

    groups = []
    count = len(lens)
    pos = 0
    while pos < count:
        if remaining_total <= max_len:
            # Whatever is left fits in one final group.
            groups.append(lens[pos:])
            break

        # Look-ahead window: longest prefix of the rest within 2 * max_len.
        window_sum = 0
        for idx in range(pos, count):
            if window_sum + lens[idx] > 2 * max_len:
                break
            window_sum += lens[idx]

        # An empty window (next sentence alone exceeds 2 * max_len) would
        # make the division below 0 / 0, so use a zero target instead.
        if window_sum > 0:
            avg_len = window_sum // np.ceil(window_sum / max_len)
        else:
            avg_len = 0

        # Always take at least one sentence so that progress is guaranteed
        # even when that sentence alone is longer than max_len.
        group = [lens[pos]]
        group_sum = lens[pos]
        pos += 1
        while (
            pos < count
            and group_sum < avg_len
            and group_sum + lens[pos] <= max_len
        ):
            group.append(lens[pos])
            group_sum += lens[pos]
            pos += 1

        groups.append(group)
        remaining_total -= group_sum
    return groups


def sent_comb3(sent_lens, max_len=35):
    """Group adjacent sentence lengths greedily, then rebalance neighbours.

    Step 1 packs sentences greedily: a sentence joins the current group
    unless that would exceed ``max_len``. Step 2 walks over the groups from
    left to right; each pair of neighbouring groups is concatenated and
    split again so that the left part stops growing once it reaches half of
    the pair's total (floor) or the next sentence would exceed ``max_len``.
    Step 2 is repeated until a full pass leaves the grouping unchanged.

    The number of groups is fixed by step 1; step 2 only moves boundaries.

    NOTE(review): termination of the repeat-until-stable loop is not proven
    here; it is kept exactly as the original algorithm defines it.

    Args:
        sent_lens: Sequence of sentence lengths, in document order.
        max_len: Upper bound for the summed length of one group.

    Returns:
        A list of groups; each group is a list of consecutive lengths from
        ``sent_lens``. If everything fits within ``max_len`` a single group
        holding all lengths is returned.
    """
    if sum(sent_lens) <= max_len:
        return [list(sent_lens)]

    # Step 1: greedy packing.
    groups = []
    current_sum = 0
    for sent_len in sent_lens:
        if groups and current_sum + sent_len <= max_len:
            groups[-1].append(sent_len)
            current_sum += sent_len
        else:
            groups.append([sent_len])
            current_sum = sent_len

    # Step 2: rebalance neighbouring pairs until nothing changes.
    while True:
        balanced = [groups[0]]
        for right in groups[1:]:
            merged = balanced[-1] + right
            half = sum(merged) // 2

            # Find the split point: the left part takes sentences while it
            # is still below half of the pair and stays within max_len.
            cut = 0
            left_sum = 0
            while (
                cut < len(merged)
                and left_sum < half
                and left_sum + merged[cut] <= max_len
            ):
                left_sum += merged[cut]
                cut += 1

            balanced[-1] = merged[:cut]
            balanced.append(merged[cut:])

        if balanced == groups:
            break
        groups = balanced
    return groups


def main():
    """Run the three grouping strategies on a sample input and time them.

    Prints the input, then for each strategy the elapsed wall-clock time
    followed by the grouping it produced.
    """
    import timeit

    max_len = 35
    # Alternative inputs for experimentation:
    #   sent_lens = np.random.randint(1, max_len, size=20)
    #   sent_lens = [34, 1, 34, 1, 30, 1]
    sent_lens = [7, 11, 12, 31, 1, 1, 26, 2, 7, 22, 1, 14, 28, 1, 1, 34, 24, 32, 10, 31]
    print(sent_lens)

    for combine in (sent_comb1, sent_comb2, sent_comb3):
        start_time = timeit.default_timer()
        comb_rlt = combine(sent_lens, max_len=max_len)
        print(timeit.default_timer() - start_time)
        print(comb_rlt)


if __name__ == "__main__":
    main()

 

Guess you like

Origin www.cnblogs.com/jacen789/p/12070493.html