[此贴子已经被作者于2022-9-16 14:37编辑过]
[此贴子已经被作者于2022-9-17 11:05编辑过]
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <locale.h>
#include "iconv.h"
#pragma comment(lib, "libiconv.lib")
#pragma warning(disable:4996)

using namespace std;

using Uint32 = unsigned int;

/// Text encodings distinguished by check_text_encode().
enum TEXT_TYPE
{
    TEXT_ANSI = 0,      ///< No BOM and not recognised as UTF-8.
    TEXT_UTF8 = 1,      ///< UTF-8 without BOM.
    TEXT_UTF8_BOM = 2,  ///< UTF-8 starting with EF BB BF.
    TEXT_UTF16_LE = 3,  ///< Starts with FF FE.
    TEXT_UTF16_BE = 4,  ///< Starts with FE FF.
    TEXT_UNKNOW = 5,    ///< File could not be opened.
};

/// Thin non-copyable wrapper around a C `FILE*` that closes it on destruction.
class File
{
public:
    File() : _ptr_file(nullptr) {}

    /// Opens `fn` with fopen mode `opr`; a failed open leaves the object closed.
    File(const char* fn, const char* opr) : _ptr_file(nullptr) { open(fn, opr); }

    ~File() { close(); }

    File(const File&) = delete;
    File& operator=(const File&) = delete;

    /// Reads a whole file into memory.
    ///
    /// @param size  receives the number of bytes read (0 on failure).
    /// @param fn    path of the file to read.
    /// @return a malloc-family buffer holding the file contents followed by at
    ///         least four zero bytes, or nullptr on failure. The caller must
    ///         release it with free().
    static char* read_file(Uint32& size, const char* fn)
    {
        size = 0;
        File tmp_file;
        if (tmp_file.open(fn, "rb") != 0)
        {
            return nullptr;
        }
        const Uint32 file_size = tmp_file.get_file_size();
        // Four extra zeroed bytes terminate the data as a string even for
        // wide encodings; size_t arithmetic avoids wrapping near 4 GiB.
        char* buf = static_cast<char*>(calloc(static_cast<size_t>(file_size) + 4, 1));
        if (!buf)
        {
            return nullptr;
        }
        size = tmp_file.read_byte(buf, file_size);
        return buf;
    }

    /// Classifies a text file by its leading byte-order mark.
    ///
    /// @return 0 when there is no BOM (plain UTF-8/ANSI),
    ///         1 for a UTF-8 BOM (EF BB BF),
    ///         2 for a big-endian UCS-2/UTF-16 BOM (FE FF),
    ///         3 for a little-endian UCS-2/UTF-16 BOM (FF FE),
    ///         -1 when the file cannot be opened.
    static int get_file_type(const char* fn)
    {
        File temp_file;
        if (temp_file.open(fn, "rb") != 0)
        {
            return -1;
        }
        unsigned char bom[3] = { 0, 0, 0 };
        // Only compare as many bytes as were really read, so a file that
        // consists of nothing but a 2-byte BOM is still recognised.
        const Uint32 got = temp_file.read_byte(reinterpret_cast<char*>(bom), 3);
        if (got >= 3 && memcmp(bom, "\xEF\xBB\xBF", 3) == 0)
        {
            return 1;
        }
        if (got >= 2 && memcmp(bom, "\xFE\xFF", 2) == 0)
        {
            return 2;
        }
        if (got >= 2 && memcmp(bom, "\xFF\xFE", 2) == 0)
        {
            return 3;
        }
        return 0;
    }

    /// Closes any file currently held, then opens `fn` with fopen mode `opr`.
    /// @return 0 on success, -1 on failure.
    int open(const char* fn, const char* opr)
    {
        close();
        _ptr_file = fopen(fn, opr);
        return _ptr_file ? 0 : -1;
    }

    /// Closes the file if one is open; safe to call repeatedly.
    void close()
    {
        if (!_ptr_file)
        {
            return;
        }
        fclose(_ptr_file);
        _ptr_file = nullptr; // prevents a second fclose from the destructor
    }

    /// Returns the file size in bytes (0 if no file is open or the size
    /// cannot be determined). The current read position is preserved.
    Uint32 get_file_size()
    {
        if (!_ptr_file)
        {
            return 0;
        }
        const long cur_pos = ftell(_ptr_file);
        if (cur_pos < 0 || fseek(_ptr_file, 0L, SEEK_END) != 0)
        {
            return 0;
        }
        const long end_pos = ftell(_ptr_file);
        fseek(_ptr_file, cur_pos, SEEK_SET);
        return end_pos < 0 ? 0 : static_cast<Uint32>(end_pos);
    }

    /// Reads up to `size` bytes into `dst`.
    /// @return the number of bytes actually read.
    Uint32 read_byte(char* dst, Uint32 size)
    {
        if (!_ptr_file || !dst)
        {
            return 0;
        }
        return static_cast<Uint32>(fread(dst, 1, size, _ptr_file));
    }

private:
    FILE* _ptr_file;
};

namespace
{
    /// Deleter for buffers returned by File::read_file().
    struct FreeDeleter
    {
        void operator()(void* p) const { free(p); }
    };

    /// iconv() declares its input argument as `char**` in some headers and as
    /// `const char**` in others; this adapter converts implicitly to whichever
    /// the header in use expects.
    class IconvInput
    {
    public:
        explicit IconvInput(const char** p) : _p(p) {}
        operator char**() const { return const_cast<char**>(_p); }
        operator const char**() const { return _p; }

    private:
        const char** _p;
    };

    /// Converts `in_len` bytes at `in` from charset `from` to charset `to`,
    /// writing at most `out_cap` bytes to `out`.
    ///
    /// @param written  receives the number of bytes stored in `out`
    ///                 (also set when the conversion fails part-way).
    /// @return true when the whole input was converted.
    bool convert_buffer(const char* to, const char* from,
                        const char* in, size_t in_len,
                        char* out, size_t out_cap, size_t& written)
    {
        written = 0;
        if (!to || !from || !out || (!in && in_len != 0))
        {
            return false;
        }
        const iconv_t invalid_cd = reinterpret_cast<iconv_t>(static_cast<std::intptr_t>(-1));
        const iconv_t cd = iconv_open(to, from);
        if (cd == invalid_cd)
        {
            return false;
        }
        size_t out_left = out_cap;
        const size_t rc = iconv(cd, IconvInput(&in), &in_len, &out, &out_left);
        iconv_close(cd); // released on both the success and the failure path
        written = out_cap - out_left;
        return rc != static_cast<size_t>(-1);
    }
}

/// Checks whether a file looks like UTF-8 text without a BOM.
///
/// The test is structural: every lead byte must be one allowed by RFC 3629
/// (C2-DF, E0-EF, F0-F4) and must be followed by the right number of
/// continuation bytes (10xxxxxx).
///
/// @return true only if the file is well-formed and contains at least one
///         multi-byte sequence; pure-ASCII files, malformed files and files
///         that cannot be opened yield false.
bool check_utf8_without_bom(const string& file_name)
{
    // Binary mode: text mode would translate line endings (and stop at
    // Ctrl-Z on Windows), changing the bytes being inspected.
    ifstream file_in(file_name, ios::in | ios::binary);
    if (!file_in.is_open())
    {
        cout << "打开文件失败" << endl;
        return false;
    }
    stringstream buffer;
    buffer << file_in.rdbuf();
    file_in.close();
    const string text = buffer.str();

    int pending = 0;            // continuation bytes still expected
    bool saw_multibyte = false; // false while only 0x00-0x7F has been seen
    for (const char raw : text)
    {
        const unsigned char ch = static_cast<unsigned char>(raw);
        if (pending == 0)
        {
            if (ch < 0x80)
            {
                continue;
            }
            saw_multibyte = true;
            if (ch >= 0xC2 && ch <= 0xDF)
            {
                pending = 1;
            }
            else if ((ch & 0xF0) == 0xE0)
            {
                pending = 2;
            }
            else if (ch >= 0xF0 && ch <= 0xF4)
            {
                pending = 3;
            }
            else
            {
                return false; // stray continuation byte or invalid lead byte
            }
        }
        else
        {
            if ((ch & 0xC0) != 0x80)
            {
                return false;
            }
            --pending;
        }
    }
    return pending == 0 && saw_multibyte;
}

/// Detects the encoding of a text file.
///
/// A BOM decides the result when present (FF FE -> UTF-16 LE, FE FF ->
/// UTF-16 BE, EF BB BF -> UTF-8 with BOM). Without a BOM the content is
/// scanned: valid non-ASCII UTF-8 gives TEXT_UTF8, anything else TEXT_ANSI.
///
/// @return the detected type, or TEXT_UNKNOW if the file cannot be opened.
TEXT_TYPE check_text_encode(const string& file_name)
{
    ifstream file_in(file_name, ios::binary);
    if (!file_in.is_open())
    {
        cout << "打开文件失败" << endl;
        return TEXT_UNKNOW;
    }
    unsigned char head[3] = { 0, 0, 0 };
    file_in.read(reinterpret_cast<char*>(head), sizeof(head));
    const streamsize got = file_in.gcount(); // short files yield fewer bytes
    file_in.close();

    if (got >= 2 && head[0] == 0xFF && head[1] == 0xFE)
    {
        return TEXT_UTF16_LE;
    }
    if (got >= 2 && head[0] == 0xFE && head[1] == 0xFF)
    {
        return TEXT_UTF16_BE;
    }
    if (got >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }
    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}

/// Converts a byte buffer between character sets using iconv.
///
/// @param to_chatset    name of the target charset.
/// @param from_charset  name of the source charset.
/// @param inbuf         input bytes.
/// @param inlen         number of input bytes.
/// @param outbuf        output buffer.
/// @param outlen        capacity of `outbuf` in bytes.
/// @return -1 on failure; otherwise the number of bytes of `outbuf` left
///         unused (bytes written = outlen - return value).
int code_convert(const char* to_chatset, const char* from_charset,
                 const char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    size_t written = 0;
    if (!convert_buffer(to_chatset, from_charset, inbuf, inlen, outbuf, outlen, written))
    {
        return -1;
    }
    return static_cast<int>(outlen - written);
}

/// Reads a text file, converts it to GBK when it is UTF-8/UTF-16, and writes
/// the result to standard output. Nothing is printed for files whose type
/// is TEXT_UNKNOW.
void ReadFile(const string& filepath)
{
    const TEXT_TYPE type = check_text_encode(filepath);
    if (type == TEXT_UNKNOW)
    {
        return;
    }

    Uint32 size = 0;
    const unique_ptr<char, FreeDeleter> buffer(File::read_file(size, filepath.c_str()));
    if (!buffer)
    {
        cerr << "读取文件失败" << endl;
        return;
    }

    const char* from_charset = nullptr;
    Uint32 bom_len = 0;
    switch (type)
    {
    case TEXT_ANSI:
        cout.write(buffer.get(), size);
        return;
    case TEXT_UTF8:
        from_charset = "UTF-8";
        break;
    case TEXT_UTF8_BOM:
        from_charset = "UTF-8";
        bom_len = 3;
        break;
    case TEXT_UTF16_LE:
        from_charset = "UTF-16LE";
        bom_len = 2;
        break;
    case TEXT_UTF16_BE:
        from_charset = "UTF-16BE";
        bom_len = 2;
        break;
    default:
        return;
    }

    // The file is read a second time here, so never skip past what was read.
    bom_len = min(bom_len, size);
    const size_t payload_len = size - bom_len;

    // Generous capacity so the output cannot run out of room.
    string converted(payload_len * 2 + 1, '\0');
    size_t written = 0;
    if (!convert_buffer("GBK", from_charset, buffer.get() + bom_len, payload_len,
                        &converted[0], converted.size(), written))
    {
        cerr << "编码转换失败" << endl;
        return;
    }
    cout.write(converted.data(), static_cast<streamsize>(written));
}

int main(int argc, char* argv[])
{
    const string File1 = "C:\\TestFile\\ANSI.txt";
    const string File2 = "C:\\TestFile\\UTF-8.txt";
    const string File3 = "C:\\TestFile\\UTF-8 BOM.txt";
    const string File4 = "C:\\TestFile\\UTF-16 LE.txt";
    const string File5 = "C:\\TestFile\\UTF-16 BE.txt";
    ReadFile(File1);
    ReadFile(File2);
    ReadFile(File3);
    ReadFile(File4);
    ReadFile(File5);
    return 0;
}