基于POI检查 excel 文件的行数、列数是否超出限制

用 POI 解析 xlsx 时,如果用这样的方式 Workbook workbook = WorkbookFactory.create(new File("super-many-row.xlsx")); 解析具有超过 100 万行的 excel ,很可能内存就被耗尽了。

因为这样虽然简单,但是要等整个 excel 都解析完成、转换成 POI 定义的对象后才会返回到业务代码,这时候才判断行数是否超过限制就晚了。

需要一种更高效的方式,在解析完成前就判断是否超过限制,了解了 POI 的 API 和 xlsx 的基本格式后,就有了下面这个工具类,基于 POI 3.13。

2016.02.28 更新:增加对 sheet 数量的校验才够完整。


/**
 * 检查 excel 文件的行数、列数是否超出限制
 *
 */
public class RowColLimitCheck {
    /**
     * 表示不检测的长度
     */
    public static final int UN_LIMITED = -1;

    /**
     * 表示 excel 超出最大行数、列数的异常类
     */
    @SuppressWarnings("serial")
    static class RowColLimitBreakException extends RuntimeException {
        public RowColLimitBreakException(String msg) {
            super(msg);
        }
    }

    /**
     * 检测 HSSF(xls)格式的 excel 文件是否超过最大行数、列数的事件监听类
     */
    static class ColRowLimitHSSFListener implements HSSFListener {
        private int maxSheetCount;
        private int maxRowCount;
        private int maxColCount;
        private int sheetCount;
        private int rowCount;

        public ColRowLimitHSSFListener(int maxSheetCount, int maxRowCount,
                int maxColCount) {
            this.maxRowCount = maxRowCount;
            this.maxColCount = maxColCount;
        }

        @Override
        public void processRecord(Record record) {
            if (record.getSid() == BoundSheetRecord.sid) {
                if ((++sheetCount) > maxSheetCount) {
                    throw new RowColLimitBreakException("sheet_exceed");
                }
                rowCount = 0; // 新的sheet,重置计数
            } else if (record.getSid() == RowRecord.sid) {
                if (maxRowCount != UN_LIMITED && (rowCount++) > maxRowCount) {
                    throw new RowColLimitBreakException("row_exceed");
                }

                RowRecord recordRow = (RowRecord) record;
                if (maxColCount != UN_LIMITED
                        && recordRow.getLastCol() > maxColCount) {
                    throw new RowColLimitBreakException("col_exceed");
                }
            }
        }
    }

    /**
     * 检测后缀为 xls 的excel 是否满足最大行数、列数的限制。
     *
     * @param file
     *            excel 文件路径
     * @param maxSheetCount
     *            允许的最大sheet数
     * @param maxRowCount
     *            允许的最大行数
     * @param maxColCount
     *            允许的最大列数
     * @return
     * null: 表示没有问题;
     * "sheet_exceed":sheet数量超过限制;
     * "row_exceed":行数超出限制;
     * "col_exceed":列数超出限制
     * 
     * @throws IOException
     */
    public static String checkRowColCount(InputStream fin, String postFix,
            int maxSheetCount, int maxRowCount, int maxColCount)
                    throws Exception {
        if (!postFix.equalsIgnoreCase("xls")
                && !postFix.equalsIgnoreCase("xlsx")) {
            throw new IllegalArgumentException("not xls or xlsx file");
        }

        if (maxRowCount == UN_LIMITED && maxColCount == UN_LIMITED
                && maxSheetCount == UN_LIMITED) {
            return null;
        }

        if (postFix.equals("xls")) {
            return checkXls(fin, maxSheetCount, maxRowCount, maxColCount);
        } else {
            return checkXlsx(fin, maxSheetCount, maxRowCount, maxColCount);
        }
    }

    private static String checkXls(InputStream fin, int maxSheetCount,
            int maxRowCount, int maxColCount) throws IOException {
        InputStream din = null;
        POIFSFileSystem poifs = new POIFSFileSystem(fin);
        try {
            din = poifs.createDocumentInputStream("Workbook");
            HSSFRequest req = new HSSFRequest();
            req.addListenerForAllRecords(new ColRowLimitHSSFListener(
                    maxSheetCount, maxRowCount, maxColCount));

            HSSFEventFactory factory = new HSSFEventFactory();
            factory.processEvents(req, din);
            return null;
        } catch (RowColLimitBreakException ex) {
            return ex.getMessage();
        } finally {
            IOUtils.closeQuietly(din);
            poifs.close();
        }
    }

    /**
     * 标签元素名含义:
     * "sheetData":sheet 标签;
     * "row":行标签;
     * "c":单元格;
     * "v":单元格的值
     */
    static class SheetHandler extends DefaultHandler {
        private int maxRowCount;
        private int maxColCount;
        private int rowCount;
        private int colCount;
        private List valueList = new LinkedList();
        private StringBuilder cellValue = new StringBuilder(32);

        public SheetHandler(int maxRowCount, int maxColCount) {
            this.maxRowCount = maxRowCount;
            this.maxColCount = maxColCount;
        }

        private boolean isRowElement(String name) {
            return "row".equals(name);
        }

        private boolean isValueElement(String name) {
            return "v".equals(name);
        }

        public void characters(char[] ch, int start, int length)
                throws SAXException {
            cellValue.append(new String(ch, start, length));
        }

        public void startElement(String uri, String localName, String name,
                Attributes attributes) throws SAXException {
            if (isValueElement(name)) {
                // 开始新的单元格,清除之前的值
                cellValue.delete(0, cellValue.length());

                // 值
                colCount++;
                if (maxColCount != UN_LIMITED && colCount > maxColCount) {
                    throw new RowColLimitBreakException("col_exceed");
                }
            }
        }

        public void endElement(String uri, String localName, String name)
                throws SAXException {
            if (isValueElement(name)) {
                valueList.add(cellValue.toString());
            }

            if (isRowElement(name)) {
                // 新的记录行开始
                if (colCount > 0) {
                    rowCount++;
                }

                if (maxRowCount != UN_LIMITED && rowCount > maxRowCount) {
                    System.out.println(rowCount + ":" + valueList);
                    throw new RowColLimitBreakException("row_exceed");
                }

                valueList.clear();
                colCount = 0;
            }
        }

        private void reset() {
            rowCount = 0;
            colCount = 0;
        }
    }

    private static String checkXlsx(InputStream fin, int maxSheetCount,
            int maxRowCount, int maxColCount) throws Exception {
        OPCPackage pkg = OPCPackage.open(fin); // 耗时
        try {
            XSSFReader xssfReader = new XSSFReader(pkg);
            XMLReader parser = XMLReaderFactory
                    .createXMLReader("org.apache.xerces.parsers.SAXParser");
            SheetHandler handler = new SheetHandler(maxRowCount, maxColCount);
            parser.setContentHandler(handler);

            Iterator sheetsData = xssfReader.getSheetsData();
            int sheetNum = 0;
            while (sheetsData.hasNext()) { // 检查每个 Sheet
                if ((++sheetNum) > maxSheetCount) {
                    return "sheet_exceed";
                }
                InputStream sheetStream = (InputStream) sheetsData.next();
                InputSource sheetSource = new InputSource(sheetStream);
                parser.parse(sheetSource);
                handler.reset();
                // IOUtils.closeQuietly(sheetStream);
            }
            return null;
        } catch (RowColLimitBreakException e) {
            return e.getMessage();
        } finally {
            IOUtils.closeQuietly(pkg);
        }
    }
}

发表评论

电子邮件地址不会被公开。 必填项已用*标注

此站点使用Akismet来减少垃圾评论。了解我们如何处理您的评论数据