用 POI 解析 xlsx 时,如果用这样的方式 Workbook workbook = WorkbookFactory.create(new File("super-many-row.xlsx"));
解析具有超过 100 万行的 excel ,很可能内存就被耗尽了。
因为这样虽然简单,但是要等整个 excel 都解析完成、转换成 POI 定义的对象后才会返回到业务代码,这时候才判断行数是否超过限制就晚了。
需要一种更高效的方式,在解析完成前就判断是否超过限制,了解了 POI 的 API 和 xlsx 的基本格式后,就有了下面这个工具类,基于 POI 3.13。
2016.02.28 更新:增加对 sheet 数量的校验才够完整。
/**
* 检查 excel 文件的行数、列数是否超出限制
*
*/
public class RowColLimitCheck {
/**
* 表示不检测的长度
*/
public static final int UN_LIMITED = -1;
/**
* 表示 excel 超出最大行数、列数的异常类
*/
@SuppressWarnings("serial")
static class RowColLimitBreakException extends RuntimeException {
public RowColLimitBreakException(String msg) {
super(msg);
}
}
/**
* 检测 HSSF(xls)格式的 excel 文件是否超过最大行数、列数的事件监听类
*/
static class ColRowLimitHSSFListener implements HSSFListener {
private int maxSheetCount;
private int maxRowCount;
private int maxColCount;
private int sheetCount;
private int rowCount;
public ColRowLimitHSSFListener(int maxSheetCount, int maxRowCount,
int maxColCount) {
this.maxRowCount = maxRowCount;
this.maxColCount = maxColCount;
}
@Override
public void processRecord(Record record) {
if (record.getSid() == BoundSheetRecord.sid) {
if ((++sheetCount) > maxSheetCount) {
throw new RowColLimitBreakException("sheet_exceed");
}
rowCount = 0; // 新的sheet,重置计数
} else if (record.getSid() == RowRecord.sid) {
if (maxRowCount != UN_LIMITED && (rowCount++) > maxRowCount) {
throw new RowColLimitBreakException("row_exceed");
}
RowRecord recordRow = (RowRecord) record;
if (maxColCount != UN_LIMITED
&& recordRow.getLastCol() > maxColCount) {
throw new RowColLimitBreakException("col_exceed");
}
}
}
}
/**
* 检测后缀为 xls 的excel 是否满足最大行数、列数的限制。
*
* @param file
* excel 文件路径
* @param maxSheetCount
* 允许的最大sheet数
* @param maxRowCount
* 允许的最大行数
* @param maxColCount
* 允许的最大列数
* @return
* null: 表示没有问题;
* "sheet_exceed":sheet数量超过限制;
* "row_exceed":行数超出限制;
* "col_exceed":列数超出限制
*
* @throws IOException
*/
public static String checkRowColCount(InputStream fin, String postFix,
int maxSheetCount, int maxRowCount, int maxColCount)
throws Exception {
if (!postFix.equalsIgnoreCase("xls")
&& !postFix.equalsIgnoreCase("xlsx")) {
throw new IllegalArgumentException("not xls or xlsx file");
}
if (maxRowCount == UN_LIMITED && maxColCount == UN_LIMITED
&& maxSheetCount == UN_LIMITED) {
return null;
}
if (postFix.equals("xls")) {
return checkXls(fin, maxSheetCount, maxRowCount, maxColCount);
} else {
return checkXlsx(fin, maxSheetCount, maxRowCount, maxColCount);
}
}
private static String checkXls(InputStream fin, int maxSheetCount,
int maxRowCount, int maxColCount) throws IOException {
InputStream din = null;
POIFSFileSystem poifs = new POIFSFileSystem(fin);
try {
din = poifs.createDocumentInputStream("Workbook");
HSSFRequest req = new HSSFRequest();
req.addListenerForAllRecords(new ColRowLimitHSSFListener(
maxSheetCount, maxRowCount, maxColCount));
HSSFEventFactory factory = new HSSFEventFactory();
factory.processEvents(req, din);
return null;
} catch (RowColLimitBreakException ex) {
return ex.getMessage();
} finally {
IOUtils.closeQuietly(din);
poifs.close();
}
}
/**
* 标签元素名含义:
* "sheetData":sheet 标签;
* "row":行标签;
* "c":单元格;
* "v":单元格的值
*/
static class SheetHandler extends DefaultHandler {
private int maxRowCount;
private int maxColCount;
private int rowCount;
private int colCount;
private List valueList = new LinkedList();
private StringBuilder cellValue = new StringBuilder(32);
public SheetHandler(int maxRowCount, int maxColCount) {
this.maxRowCount = maxRowCount;
this.maxColCount = maxColCount;
}
private boolean isRowElement(String name) {
return "row".equals(name);
}
private boolean isValueElement(String name) {
return "v".equals(name);
}
public void characters(char[] ch, int start, int length)
throws SAXException {
cellValue.append(new String(ch, start, length));
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if (isValueElement(name)) {
// 开始新的单元格,清除之前的值
cellValue.delete(0, cellValue.length());
// 值
colCount++;
if (maxColCount != UN_LIMITED && colCount > maxColCount) {
throw new RowColLimitBreakException("col_exceed");
}
}
}
public void endElement(String uri, String localName, String name)
throws SAXException {
if (isValueElement(name)) {
valueList.add(cellValue.toString());
}
if (isRowElement(name)) {
// 新的记录行开始
if (colCount > 0) {
rowCount++;
}
if (maxRowCount != UN_LIMITED && rowCount > maxRowCount) {
System.out.println(rowCount + ":" + valueList);
throw new RowColLimitBreakException("row_exceed");
}
valueList.clear();
colCount = 0;
}
}
private void reset() {
rowCount = 0;
colCount = 0;
}
}
private static String checkXlsx(InputStream fin, int maxSheetCount,
int maxRowCount, int maxColCount) throws Exception {
OPCPackage pkg = OPCPackage.open(fin); // 耗时
try {
XSSFReader xssfReader = new XSSFReader(pkg);
XMLReader parser = XMLReaderFactory
.createXMLReader("org.apache.xerces.parsers.SAXParser");
SheetHandler handler = new SheetHandler(maxRowCount, maxColCount);
parser.setContentHandler(handler);
Iterator sheetsData = xssfReader.getSheetsData();
int sheetNum = 0;
while (sheetsData.hasNext()) { // 检查每个 Sheet
if ((++sheetNum) > maxSheetCount) {
return "sheet_exceed";
}
InputStream sheetStream = (InputStream) sheetsData.next();
InputSource sheetSource = new InputSource(sheetStream);
parser.parse(sheetSource);
handler.reset();
// IOUtils.closeQuietly(sheetStream);
}
return null;
} catch (RowColLimitBreakException e) {
return e.getMessage();
} finally {
IOUtils.closeQuietly(pkg);
}
}
}
欢迎关注我的微信公众号: coderbee笔记,可以更及时回复你的讨论。