最近工作中用到文本分析，有的文本足足有好几个 G，一次性加载到内存中肯定不合适，所以在读取文件时使用缓冲区来分批读取文件。
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * Demonstrates reading a very large text file through a buffered reader so
 * that the file is never loaded into memory as a whole, copying its leading
 * lines to a second file.
 */
public class ReadBigFile {

    /** Path of the source file. */
    private static final String INPUT_PATH = "E:/BaiduYunDownload/dict.dic";

    /** Path of the file the copied lines are written to. */
    private static final String OUTPUT_PATH = "E:/BaiduYunDownload/dict2.dic";

    /** Size of the reader's buffer, in chars (10M chars). */
    private static final int READ_BUFFER_CHARS = 10 * 1024 * 1024;

    /**
     * Number of lines copied before stopping; this is only a demo, so the
     * whole file does not need to be read. The original loop stopped once
     * its counter exceeded 20000, i.e. after 20001 lines.
     */
    private static final int MAX_LINES = 20001;

    /**
     * Program entry point; runs the copy with the built-in paths.
     *
     * @param args command-line arguments (unused)
     * @throws Exception declared for compatibility; I/O failures are reported
     *     inside {@link #readBigTxt()} and not rethrown
     */
    public static void main(String[] args) throws Exception {
        readBigTxt();
    }

    /**
     * Copies up to {@link #MAX_LINES} lines from {@link #INPUT_PATH} to
     * {@link #OUTPUT_PATH}. Both files are handled as UTF-8. I/O failures are
     * printed to standard error; the completion message is printed only when
     * the copy succeeded and both files were closed.
     */
    private static void readBigTxt() {
        try {
            // Both resources are closed automatically, in reverse order, even
            // when the copy fails part-way through.
            try (BufferedReader in = new BufferedReader(
                         new InputStreamReader(
                                 new FileInputStream(INPUT_PATH), StandardCharsets.UTF_8),
                         READ_BUFFER_CHARS);
                 // Write with the same charset that is used for reading;
                 // FileWriter would use the platform default instead.
                 BufferedWriter out = new BufferedWriter(
                         new OutputStreamWriter(
                                 new FileOutputStream(OUTPUT_PATH), StandardCharsets.UTF_8))) {
                copyLines(in, out, MAX_LINES);
            }
            System.out.println("文件输出完成");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies at most {@code maxLines} lines from {@code in} to {@code out},
     * terminating every written line with a single {@code '\n'}.
     *
     * <p>End of input is detected by {@code readLine()} returning
     * {@code null}. {@code Reader.ready()} is deliberately not used for this:
     * it only says whether a read would block, not whether input remains.
     *
     * <p>Package-private so the copy loop can be exercised without real files.
     *
     * @param in source of lines
     * @param out destination; it is neither flushed nor closed here
     * @param maxLines upper bound on the number of lines copied; zero or a
     *     negative value copies nothing
     * @return the number of lines actually copied
     * @throws IOException if reading or writing fails
     */
    static int copyLines(BufferedReader in, Writer out, int maxLines) throws IOException {
        int copied = 0;
        String line;
        // Check the limit first so no line is consumed beyond the limit.
        while (copied < maxLines && (line = in.readLine()) != null) {
            out.write(line);
            out.write('\n');
            copied++;
        }
        return copied;
    }
}
为了测试,从网盘上把以前破解WIFI密码时用的字典下载了下来,文件有10G,读取时完全没有问题,机器没有出现卡顿现象。