// StarDict dictionary .idx file format.
//
// Each entry in the word list consists of three consecutive fields:
//
//	word_str;         // A UTF-8 string terminated by '\0'.
//	                  // The length of word_str is less than 256.
//	word_data_offset; // The word data's offset in the .dict file.
//	                  // If the version is "3.0.0" and "idxoffsetbits=64",
//	                  // word_data_offset is a 64-bit unsigned number in
//	                  // network byte order.
//	word_data_size;   // The word data's total size in the .dict file.
//	                  // word_data_size is a 32-bit unsigned number in
//	                  // network byte order.
package main
import (
// "bufio"
"io"
"os"
"fmt"
"strconv"
)
// main converts the StarDict index file "gaojihanyudacidian_fix.idx" in the
// working directory into the text file "output.txt". Every index entry becomes
// one line of the form "<word>\t<offset>\t<size>\n", with offset and size
// printed in decimal.
//
// Only the layout with 4-byte offset fields is handled; indexes written with
// "idxoffsetbits=64" are not supported. Any I/O failure or malformed word
// aborts the program with a panic.
func main() {
	fi, err := os.Open("gaojihanyudacidian_fix.idx")
	if err != nil {
		panic(err)
	}
	defer fi.Close()

	fo, err := os.Create("output.txt")
	if err != nil {
		panic(err)
	}
	defer fo.Close()

	// Load the whole index with a single read instead of issuing one
	// 4-byte read per character.
	info, err := fi.Stat()
	if err != nil {
		panic(err)
	}
	data := make([]byte, info.Size())
	if _, err := io.ReadFull(fi, data); err != nil {
		panic(err)
	}

	consumed, err := writeIdxAsText(fo, data)
	if err != nil {
		panic(err)
	}
	if rest := len(data) - consumed; rest > 0 {
		// A partial trailing entry is skipped, but no longer silently.
		fmt.Fprintf(os.Stderr, "warning: ignoring %d trailing bytes that do not form a complete entry\n", rest)
	}
	fmt.Printf("\nfinish read\n")
}

// writeIdxAsText decodes consecutive index entries from data and writes each
// one to w as "<word>\t<offset>\t<size>\n". It stops at the first position
// where a complete entry (word, NUL terminator, 4-byte offset, 4-byte size)
// is no longer available. It returns the number of bytes covered by the
// entries that were written, and an error if a word is malformed or a write
// fails; entries preceding the failure have already been written.
func writeIdxAsText(w io.Writer, data []byte) (int, error) {
	const fieldBytes = 4 // width of the offset field and of the size field

	var line []byte // reused for every entry so each one costs a single Write
	pos := 0
	for pos < len(data) {
		// The word runs up to, but not including, the first NUL byte.
		end := pos
		for end < len(data) && data[end] != 0 {
			end++
		}
		if end+1+2*fieldBytes > len(data) {
			break // terminator or numeric fields are missing: truncated entry
		}

		word := data[pos:end]
		if bad := firstMalformedUTF8(word); bad >= 0 {
			return pos, fmt.Errorf("malformed UTF-8 sequence starting with byte 0x%02x at file offset %d", word[bad], pos+bad)
		}
		offset := beUint32(data[end+1:])
		size := beUint32(data[end+1+fieldBytes:])

		line = append(line[:0], word...)
		line = append(line, '\t')
		line = strconv.AppendUint(line, uint64(offset), 10)
		line = append(line, '\t')
		line = strconv.AppendUint(line, uint64(size), 10)
		line = append(line, '\n')
		if _, err := w.Write(line); err != nil {
			return pos, err
		}
		pos = end + 1 + 2*fieldBytes
	}
	return pos, nil
}

// firstMalformedUTF8 walks word one UTF-8 sequence at a time, looking only at
// lead bytes. It returns the index of the first sequence whose lead byte is
// not a valid UTF-8 lead byte or whose announced length runs past the end of
// word, or -1 if there is none. Continuation bytes are not inspected.
func firstMalformedUTF8(word []byte) int {
	for i := 0; i < len(word); {
		n := utf8SeqLen(word[i])
		if n == 0 || i+n > len(word) {
			return i
		}
		i += n
	}
	return -1
}

// utf8SeqLen returns the length in bytes of the UTF-8 sequence introduced by
// lead, or 0 if lead cannot start a sequence.
func utf8SeqLen(lead byte) int {
	switch {
	case lead < 0x80:
		return 1
	case lead < 0xC2:
		return 0 // continuation byte, or a lead that only encodes overlong forms
	case lead < 0xE0:
		return 2
	case lead < 0xF0:
		return 3
	case lead < 0xF5:
		return 4
	default:
		return 0 // would encode a value above U+10FFFF
	}
}

// beUint32 decodes the first four bytes of b as a big-endian (network byte
// order) unsigned 32-bit integer. b must hold at least four bytes.
func beUint32(b []byte) uint32 {
	return uint32(b[0])<<24 | uint32(b[1])<<16 | uint32(b[2])<<8 | uint32(b[3])
}