Heap Sort + the Top-K Problem — "Data Structures and Algorithms"

Hello to everyone on CSDN — long time no see! I stopped updating for quite a while, but Xiao Yalan is now slowly getting back to posting. Let's get into what Xiao Yalan is sharing today and step into the world of the heap together!


Heap sort - (1)

Contents of heap.h:

#pragma once
#include<stdio.h>
#include<stdlib.h>
#include<assert.h>
#include<stdbool.h>
// Element type stored in the heap.
typedef int HeapDataType;
// Array-backed binary heap.
typedef struct Heap
{
	// Dynamically allocated storage for the elements (NULL while the heap is empty and unallocated).
	HeapDataType* a;
	// Number of elements currently stored in a.
	int size;
	// Number of elements a has room for before it must grow.
	int capacity;
}Heap;
// Initialize the heap to the empty state (no storage allocated).
void HeapInit(Heap* php);
// Destroy the heap: release its storage and reset it to the empty state.
void HeapDestroy(Heap* php);
// Insert x into the heap.
void HeapPush(Heap* php, HeapDataType x);
// Upward adjustment (sift-up) of the element at index child.
void AdjustUp(HeapDataType* a, int child);
// Remove the element at the top of the heap.
void HeapPop(Heap* php);
// Downward adjustment (sift-down) of the element at index parent within the first n elements.
void AdjustDown(int* a, int n, int parent);
// Return true when the heap holds no elements.
bool HeapEmpty(Heap* php);
// Return the element at the top of the heap.
HeapDataType HeapTop(Heap* php);
// Return the number of elements in the heap.
int HeapSize(Heap* php);

Contents of heap.c:

#include"heap.h"
// Initialize the heap: no storage yet, zero elements, zero capacity.
// php must not be NULL.
void HeapInit(Heap* php)
{
	assert(php);
	// Reset every field in a single assignment.
	*php = (Heap){ .a = NULL, .size = 0, .capacity = 0 };
}
// Destroy the heap: release the element storage and return the
// structure to the empty state so it can be safely reused.
// php must not be NULL.
void HeapDestroy(Heap* php)
{
	assert(php);
	HeapDataType* storage = php->a;
	php->a = NULL;
	php->capacity = 0;
	php->size = 0;
	free(storage);
}
// Exchange the two values pointed to by p1 and p2.
// Both values are read before either is written, so the call is
// also correct when p1 and p2 point to the same element.
void Swap(HeapDataType* p1, HeapDataType* p2)
{
	const HeapDataType first = *p1;
	const HeapDataType second = *p2;
	*p1 = second;
	*p2 = first;
}
// Upward adjustment (sift-up) for a min-heap.
// a     - array holding the heap
// child - index of the element to move up toward the root
void AdjustUp(HeapDataType* a, int child)
{
	while (child > 0)
	{
		int parent = (child - 1) / 2;
		// Min-heap: stop as soon as the child is not smaller than its parent.
		if (!(a[child] < a[parent]))
		{
			break;
		}
		Swap(&a[child], &a[parent]);
		// Continue one level up.
		child = parent;
	}
}
// Insert x into the heap.
// Grows the storage (4 slots initially, then doubling) when it is full;
// if the reallocation fails an error is reported and x is not inserted.
// php must not be NULL.
void HeapPush(Heap* php, HeapDataType x)
{
	assert(php);
	// Grow the storage when it is full.
	if (php->size == php->capacity)
	{
		int grown = 4;
		if (php->capacity != 0)
		{
			grown = php->capacity * 2;
		}
		HeapDataType* bigger = (HeapDataType*)realloc(php->a, grown * sizeof(HeapDataType));
		if (bigger == NULL)
		{
			perror("realloc fail");
			return;
		}
		php->capacity = grown;
		php->a = bigger;
	}
	// Append at the end, then sift the new element up into position.
	int slot = php->size++;
	php->a[slot] = x;
	AdjustUp(php->a, slot);
}
// Downward adjustment (sift-down) for a min-heap.
// The parameter is deliberately int* rather than HeapDataType* so the
// routine can be reused directly by heap sort on plain int arrays.
//
// a      - array holding the heap
// n      - number of valid elements in a
// parent - index of the element to sift down; both of its subtrees are
//          expected to satisfy the heap property already
void AdjustDown(int* a, int n, int parent)
{
	// Assume the left child is the smaller one to begin with.
	int child = parent * 2 + 1;
	while (child < n) // the child is inside the array
	{
		// Pick the smaller of the two children. The assumption above may be
		// wrong: in a complete binary tree a missing left child means there
		// is no right child either, but a left child may exist alone.
		// The bounds check must come first: testing a[child + 1] before
		// child + 1 < n would read past the end of the array.
		if (child + 1 < n && a[child + 1] < a[child])
		{
			++child;
		}
		// child is now the smaller child (left or right does not matter);
		// for a min-heap only that one needs comparing with the parent.
		if (a[child] < a[parent])
		{
			// Swapped inline so that the routine stays usable on int arrays.
			int tmp = a[child];
			a[child] = a[parent];
			a[parent] = tmp;
			// Descend: the swapped-down value now lives at child.
			parent = child;
			child = parent * 2 + 1; // again default to the left child
		}
		else
		{
			break;
		}
	}
}
// Report whether the heap holds no elements.
// php must not be NULL.
bool HeapEmpty(Heap* php)
{
	assert(php);
	return php->size == 0;
}
// Remove the element at the top of the heap.
// php must not be NULL and the heap must not be empty.
void HeapPop(Heap* php)
{
	assert(php);
	assert(!HeapEmpty(php));
	// Move the top element to the last slot, drop that slot from the heap,
	// then sift the element that replaced the top back down.
	int last = php->size - 1;
	Swap(&php->a[0], &php->a[last]);
	php->size = last;
	AdjustDown(php->a, last, 0);
}
// Return the element at the top of the heap without removing it.
// php must not be NULL and the heap must not be empty.
HeapDataType HeapTop(Heap* php)
{
	assert(php);
	assert(!HeapEmpty(php));
	const HeapDataType* root = php->a;
	return *root;
}
// Return the number of elements currently stored in the heap.
// php must not be NULL.
int HeapSize(Heap* php)
{
	assert(php);
	const int count = php->size;
	return count;
}

Contents of test.c:

// Sort a[0..n-1] by pushing every element into a temporary Heap and then
// popping the elements back into the array in the order the heap yields them.
// a - array to sort in place
// n - number of elements in a
void HeapSort(int* a, int n)
{
	Heap hp;
	HeapInit(&hp);
	for (int i = 0; i < n; i++)
	{
		HeapPush(&hp, a[i]);
	}
	// Write back starting from index 0 with a separate counter: reusing the
	// push-loop index (which equals n at this point) would write past the
	// end of the array and leave the input unsorted.
	int out = 0;
	while (!HeapEmpty(&hp))
	{
		a[out++] = HeapTop(&hp);
		HeapPop(&hp);
	}
	HeapDestroy(&hp);
}
// Demo driver: sorts a small fixed array with HeapSort.
int main()
{
	int a[] = { 7,8,3,5,1,9,5,4 };
	// Element count of the array computed at compile time.
	const int count = (int)(sizeof a / sizeof a[0]);
	HeapSort(a, count);
	return 0;
}

Sorting this way does work.

But it has drawbacks!

First: a complete heap data structure has to exist before you can sort, which is too much trouble.

Second: the space complexity is too high (O(N) extra space), and the data has to be copied into the heap and back.

Heap sort - (2)

First of all, a heap still has to be built — but this time directly inside the array being sorted!

The first method: build the heap by upward adjustment

// Build the heap by upward adjustment: sift each element up in turn,
// starting from index 1 (the single element a[0] needs no adjustment).
int i = 0;
for (i = 1; i < n; i++)
{
	AdjustUp(a, i);
}

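Suppose we build a min-heap in order to sort in ascending order: once the smallest element at the root has been fixed in place, the remaining elements no longer form a heap (the parent–child relationships are all broken), so the heap would have to be rebuilt for every element: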

 

So sorting in ascending order requires building a max-heap.

 

Likewise, sorting in descending order requires building a min-heap.

// In-place heap sort; the heap is built by upward adjustment.
// Ascending order needs a max-heap, descending order needs a min-heap;
// which one is built depends on the comparison used in AdjustUp/AdjustDown.
// a - array to sort in place
// n - number of elements in a
void HeapSort(int* a, int n)
{
	// Phase 1: build the heap by sifting every element up in turn.
	for (int idx = 1; idx < n; ++idx)
	{
		AdjustUp(a, idx);
	}
	// Phase 2: repeatedly move the top of the heap to the end of the
	// unsorted range, shrink the range by one, and repair the heap.
	for (int last = n - 1; last > 0; --last)
	{
		Swap(&a[0], &a[last]);
		AdjustDown(a, last, 0);
	}
}

 

The second method: build the heap by downward adjustment

 

// Build the heap by downward adjustment: start at the parent of the last
// element, index (n - 1 - 1) / 2, and sift down every node back to the root.
int i = 0;
for (i = (n - 1 - 1) / 2; i >= 0; i--)
{
	AdjustDown(a, n, i);
}

Complete heap sort code:

// In-place heap sort; the heap is built by downward adjustment.
// Ascending order needs a max-heap, descending order needs a min-heap;
// which one is built depends on the comparison used in AdjustDown.
// a - array to sort in place
// n - number of elements in a
void HeapSort(int* a, int n)
{
	// Phase 1: build the heap, sifting down from the parent of the last
	// element back to the root.
	int node = (n - 1 - 1) / 2;
	while (node >= 0)
	{
		AdjustDown(a, n, node);
		node--;
	}
	// Phase 2: repeatedly move the top of the heap to the end of the
	// unsorted range, shrink the range by one, and repair the heap.
	for (int last = n - 1; last > 0; --last)
	{
		Swap(&a[0], &a[last]);
		AdjustDown(a, last, 0);
	}
}

 


Time complexity of building a heap by downward adjustment

Levels with more nodes need fewer downward moves per node; levels with fewer nodes need more downward moves per node.

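The last level does not need to be adjusted, so the count starts from the second-to-last level.

A common mathematical technique is used here — subtraction of shifted series (multiply the sum by 2 and subtract the original). For a full binary tree of height $h$ with $N = 2^h - 1$ nodes, level $i$ has $2^{i-1}$ nodes, each moving down at most $h - i$ levels, so $T(N) = \sum_{i=1}^{h-1} 2^{i-1}(h - i) = 2^h - 1 - h = N - \log_2(N+1)$, i.e. $O(N)$.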

 

 

Time complexity of building a heap by upward adjustment

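Levels with more nodes need more upward moves per node; levels with fewer nodes need fewer. For a full binary tree of height $h$: $T(N) = \sum_{i=2}^{h} 2^{i-1}(i - 1) = (h - 2)\,2^h + 2$, i.e. $O(N \log N)$.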

Therefore, building the heap by upward adjustment ($O(N \log N)$) is noticeably less efficient than building it by downward adjustment ($O(N)$).

 

 

 


TopK questions

 Top-K problem: find the K largest or K smallest elements in a data set. Generally, the amount of data is relatively large.

For example: the top 10 professional players, the world's top 500, the rich list, the top 100 active players in the game, etc.

For the Top-K problem, the simplest and most direct approach is sorting. However, if the amount of data is very large, sorting is not advisable (it may not be possible to load all the data into memory at once). The best way is to use a heap. The basic idea is as follows:

Use the first K elements of the data set to build a heap

  • To find the K largest elements, build a min-heap
  • To find the K smallest elements, build a max-heap

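Compare each of the remaining N−K elements with the top of the heap in turn; if an element beats the top (for example, it is larger than the top of the min-heap when looking for the K largest), replace the top with it and adjust downward.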

        After the remaining N−K elements have been compared with the top of the heap in turn, the K elements left in the heap are the K smallest or K largest elements sought.

 

If there is a lot of data, it is stored in a file on disk and read back one value at a time:

 

// Generate the test data: writes 10000 pseudo-random integers in the
// range [0, 1000000), one per line, to "data.txt".
// Reports an error and returns if the file cannot be opened.
void CreateNDate()
{
	const int count = 10000;
	const char* path = "data.txt";
	// Seed the generator from the current time.
	srand(time(0));
	FILE* out = fopen(path, "w");
	if (!out)
	{
		perror("fopen error");
		return;
	}
	int written = 0;
	while (written < count)
	{
		fprintf(out, "%d\n", rand() % 1000000);
		++written;
	}
	fclose(out);
}
// Print the k largest integers stored in "data.txt".
// The values are printed in heap order (not sorted), separated by spaces
// and followed by a newline. If the file holds fewer than k values, all of
// them are printed. Does nothing when k <= 0.
// k - how many of the largest values to report
void PrintTopK(int k)
{
	// A heap of zero or negative size is meaningless, and kminheap[0] would be out of bounds.
	if (k <= 0)
	{
		return;
	}
	const char* file = "data.txt";
	FILE* fout = fopen(file, "r");
	if (fout == NULL)
	{
		perror("fopen error");
		return;
	}
	int* kminheap = (int*)malloc(sizeof(int) * (size_t)k);
	if (kminheap == NULL)
	{
		perror("malloc error");
		fclose(fout); // do not leak the open file on this error path
		return;
	}
	// Read the first k values, stopping early if the file runs out, so that
	// no uninitialized slot is ever used.
	int count = 0;
	while (count < k && fscanf(fout, "%d", &kminheap[count]) == 1)
	{
		count++;
	}
	if (count == k)
	{
		// Build a min-heap: its top is the smallest of the current k candidates.
		for (int i = (k - 1 - 1) / 2; i >= 0; i--)
		{
			AdjustDown(kminheap, k, i);
		}
		// Loop on the result of fscanf rather than on feof(): feof() only
		// becomes true after a read has already failed, which would make the
		// last value be processed twice.
		int val = 0;
		while (fscanf(fout, "%d", &val) == 1)
		{
			// A value larger than the smallest candidate replaces it.
			if (val > kminheap[0])
			{
				kminheap[0] = val;
				AdjustDown(kminheap, k, 0);
			}
		}
	}
	for (int i = 0; i < count; i++)
	{
		printf("%d ", kminheap[i]);
	}
	printf("\n");
	free(kminheap);
	fclose(fout);
}

Alright, that is the end of what Xiao Yalan learned today. I am still not good enough, so I have to keep working hard!

 

Guess you like

Origin blog.csdn.net/weixin_74957752/article/details/131376172