/**
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.jpedal.org
* (C) Copyright 1997-2008, IDRsolutions and Contributors.
*
* This file is part of JPedal
*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* ---------------
* FindTextInRectangle.java
* ---------------
*/
package org.jpedal.examples.text;
//JFC
import java.awt.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.grouping.SearchType;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.LogWriter;
/**
*
* Sample code showing how jpedal library can be used with
* pdf files to find text from a specified Rectangle.
* Can optionally write an XML file of the output.
*
* Debugging tip: Set verbose=true in LogWriter to see what is going on.
*/
public class FindTextInRectangle
{
    /**
     * Stride used when walking the array returned by findText: the reporting
     * loop steps five floats per hit and reads the first two as x and y.
     */
    private static final int VALUES_PER_HIT = 5;

    /** Charset used for the XML output; must match the encoding named in the XML declaration. */
    private static final String XML_ENCODING = "UTF-8";

    /** return value for testing - result of the most recent successful findText call (null if nothing found) */
    float[] co_ords = null;

    /** area to search as {x1, y1, x2, y2}; null means use the page media box */
    int[] areaToScan = null;

    /** correct separator for OS */
    String separator = System.getProperty("file.separator");

    /** the decoder object which decodes the pdf and returns a data object */
    PdfDecoder decodePdf = null;

    /** word to find */
    private String textToFind = null;

    /** xml output file */
    private File xmlOutputFile;
    private static String xmlOutputPath;

    /** Output control flags. Default settings */
    private static boolean enableXML = false;
    private static boolean enableSTDout = true;

    /** Creates an instance without running a search. */
    public FindTextInRectangle()
    {
    }

    /**
     * Searches a pdf file (or every pdf file in a directory) for the given
     * text, scanning the whole media box of each page.
     *
     * @param file_name  path of a pdf file or of a directory containing pdf files
     * @param textToFind text to search for
     */
    public FindTextInRectangle(String file_name, String textToFind)
    {
        this.textToFind = textToFind;
        findText(file_name);
    }

    /**
     * Searches a pdf file (or every pdf file in a directory) for the given
     * text inside the supplied rectangle.
     *
     * @param file_name  path of a pdf file or of a directory containing pdf files
     * @param textToFind text to search for
     * @param areaToScan rectangle as {x1, y1, x2, y2} (x1,y1 top left, x2,y2 bottom right);
     *                   null scans the whole media box
     */
    public FindTextInRectangle(String file_name, String textToFind, int[] areaToScan)
    {
        this.textToFind = textToFind;
        this.areaToScan = areaToScan;
        findText(file_name);
    }

    /**
     * Runs the search over a single pdf file, or over every pdf file directly
     * inside a directory, wrapping the results in the XML document when XML
     * output is enabled.
     *
     * @param file_name path of a pdf file or of a directory
     */
    private void findText(String file_name)
    {
        createXMLFile(true);
        try {
            /*
             * if file name ends pdf, do the file otherwise do every pdf file
             * in the directory. The caller has already checked the path
             * exists, but we still need to check it is a directory.
             */
            File targetFile = new File(file_name);
            if (file_name.toLowerCase().endsWith(".pdf")) {
                decodeFile(file_name);
            }
            else if (targetFile.isDirectory()) {
                searchDirectory(targetFile, file_name);
            }
            else {
                System.err.println(file_name + " is not a directory. Exiting program");
            }
        }
        finally {
            //always close the root element, even if a search blew up
            createXMLFile(false);
        }
    }

    /** Decodes every entry of the directory whose name ends in ".pdf" (case-insensitive). */
    private void searchDirectory(File directory, String file_name)
    {
        //File.list() returns null when the directory cannot be read
        String[] files = directory.list();
        if (files == null) {
            System.err.println("Unable to list files in " + file_name);
            return;
        }

        //make sure name ends with a separator for correct path later
        String prefix = file_name.endsWith(separator) ? file_name : file_name + separator;

        int fileCount = files.length;
        for (int i = 0; i < fileCount; i++) {
            if (enableSTDout) {
                //report a 1-based position so the last file reads "N of N"
                System.out.println("File " + (i + 1) + " of " + fileCount + ' ' + files[i]);
            }
            if (files[i].toLowerCase().endsWith(".pdf")) {
                String path = prefix + files[i];
                if (enableSTDout) {
                    System.out.println(path);
                }
                decodeFile(path);
            }
        }
    }

    /**
     * Opens one pdf file, searches all of its pages and records the hits.
     * The &lt;file&gt; XML element is always closed, and the pdf is always
     * closed once it has been opened successfully.
     *
     * @param file_name path of the pdf file to search
     */
    private void decodeFile(String file_name)
    {
        /*
         * Debugging tip: a log can be created here, e.g.
         * LogWriter.setupLogFile(true,0,"","v",false);
         * LogWriter.log_name = "/mnt/shared/log.txt";
         */
        createFileXMLElement(file_name, true);
        try {
            if (!openPdf(file_name)) {
                return;
            }
            try {
                //extract data from pdf (if allowed)
                if ((decodePdf.isEncrypted() && (!decodePdf.isPasswordSupplied())) && (!decodePdf.isExtractionAllowed())) {
                    System.out.println("Encrypted settings");
                    System.out.println(
                        "Please look at SimpleViewer for code sample to handle such files");
                    System.out.println("Or get support/consultancy");
                }
                else {
                    searchAllPages(file_name);
                }
            }
            finally {
                //close the pdf file
                decodePdf.closePdfFile();
            }
        }
        finally {
            //Close file xml element
            createFileXMLElement(file_name, false);
        }
    }

    /**
     * Creates a text-only decoder and opens the file with it.
     *
     * @return true if the file was opened, false if PdfDecoder reported a PdfException
     */
    private boolean openPdf(String file_name)
    {
        //PdfDecoder returns a PdfException if there is a problem
        try {
            decodePdf = new PdfDecoder(false);
            decodePdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
            //original note: make sure widths in data - CRITICAL if we want to split lines correctly
            decodePdf.init(true);

            //open the file (and read metadata including pages in file)
            if (enableSTDout) {
                System.out.println("Opening file: " + file_name);
            }
            decodePdf.openPdfFile(file_name);
            return true;
        }
        catch (PdfException e) {
            System.err.println("Ignoring " + file_name);
            System.err.println("Due to: " + e);
            return false;
        }
    }

    /** Decodes each page of the open pdf in turn and searches it; stops at the first failure. */
    private void searchAllPages(String file_name)
    {
        //page range
        int start = 1, end = decodePdf.getPageCount();

        try {
            for (int page = start; page <= end; page++) { //read pages
                if (enableSTDout) {
                    System.out.println("=========================");
                    System.out.println("Page " + page);
                    System.out.println("=========================");
                }

                //decode the page
                decodePdf.decodePage(page);

                //grouping object used to apply grouping to the page data
                PdfGroupingAlgorithms currentGrouping = decodePdf.getGroupingObject();
                if (currentGrouping != null && !searchPage(currentGrouping, page, file_name)) {
                    return;
                }
            }
            //remove data once written out
            decodePdf.flushObjectValues(false);
        }
        catch (Exception e) {
            //print the exception itself: getMessage() may be null
            System.err.println("Exception: " + e);
            return;
        }

        //flush data structures - not strictly required but included as example
        decodePdf.flushObjectValues(true); //flush any text data read

        //tell user
        if (enableSTDout) {
            System.out.println("File read...");
        }
    }

    /**
     * Searches one decoded page and reports every hit to the console and XML.
     *
     * @return false if the search itself failed with a PdfException (already
     *         reported), in which case the rest of the file should be skipped
     */
    private boolean searchPage(PdfGroupingAlgorithms currentGrouping, int page, String file_name)
    {
        int x1, y1, x2, y2;

        //use whole page size for demo - get data from PageData object unless set
        if (areaToScan == null) {
            PdfPageData currentPageData = decodePdf.getPdfPageData();
            x1 = currentPageData.getMediaBoxX(page);
            x2 = currentPageData.getMediaBoxWidth(page) + x1;
            y2 = currentPageData.getMediaBoxY(page);
            y1 = currentPageData.getMediaBoxHeight(page) + y2;
        }
        else {
            x1 = areaToScan[0];
            y1 = areaToScan[1];
            x2 = areaToScan[2];
            y2 = areaToScan[3];
        }

        //tell user
        if (enableSTDout) {
            System.out.println(
                "Scanning for text (" + textToFind + ") rectangle ("
                + x1 + ',' + y1 + ' ' + x2 + ',' + y2 + ')');
        }

        /*
         * Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right).
         * Co-ords for the start of each match come back in a float array;
         * null means not found.
         */
        float[] found;
        try {
            found = currentGrouping.findText(
                new Rectangle(x1, y2, x2 - x1, y1 - y2),
                page,
                new String[]{textToFind},
                SearchType.MUTLI_LINE_RESULTS);
        }
        catch (PdfException e) {
            System.err.println("Ignoring " + file_name);
            System.err.println("Due to: " + e);
            return false;
        }
        this.co_ords = found;

        if (found == null) {
            if (enableSTDout) {
                System.out.println("Text not found on page.");
            }
            return true;
        }

        if (enableSTDout) {
            System.out.println("Found " + (found.length / VALUES_PER_HIT) + " on page.");
        }
        //i + 1 bound guards the y read if the array is ever not a whole number of hits
        for (int i = 0; i + 1 < found.length; i += VALUES_PER_HIT) {
            if (enableSTDout) {
                System.out.println("Text found at " + found[i] + ", " + found[i + 1]);
            }
            createFindXMLElement(found[i], found[i + 1], page);
        }
        return true;
    }

    /**
     * @return the co-ordinates array from the most recent successful findText
     *         call (the internal array, not a copy), or null if nothing was found
     */
    public float[] getCoords()
    {
        return co_ords;
    }

    /**
     * Starts or finishes the XML output document. Does nothing when XML
     * output is disabled. If the file cannot be started, XML output is
     * switched off for the rest of the run.
     *
     * @param open True to create a new XML file. False to close tags on the new file.
     */
    public void createXMLFile(boolean open)
    {
        if (!enableXML) {
            return;
        }

        if (open) {
            xmlOutputFile = new File(xmlOutputPath);
            //opening without append creates the file or truncates an existing one
            boolean started = writeXml(new String[]{
                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
                "<search>",
                "<term>" + escapeXml(textToFind) + "</term>"
            }, false);
            if (!started) {
                enableXML = false;
            }
        }
        else {
            writeXml(new String[]{"</search>"}, true);
        }
    }

    /**
     * Appends the opening or closing of a &lt;file&gt; element to the XML
     * output. Does nothing when XML output is disabled.
     *
     * @param filePath The contents of the files &lt;path&gt; element
     * @param open True if the opening of a &lt;file&gt; element is required, otherwise close a file element
     */
    public void createFileXMLElement(String filePath, boolean open)
    {
        if (!enableXML) {
            return;
        }

        if (open) {
            writeXml(new String[]{
                "<file>",
                "<path>" + escapeXml(filePath) + "</path>"
            }, true);
        }
        else {
            writeXml(new String[]{"</file>"}, true);
        }
    }

    /**
     * Appends a &lt;found&gt; element describing one hit to the XML output.
     * Does nothing when XML output is disabled.
     *
     * @param x      x co-ordinate of the hit
     * @param y      y co-ordinate of the hit
     * @param pageNo page the hit was found on
     */
    public void createFindXMLElement(float x, float y, int pageNo)
    {
        if (!enableXML) {
            return;
        }

        writeXml(new String[]{
            "<found>",
            "<pageNo>" + pageNo + "</pageNo>",
            "<x>" + x + "</x>",
            "<y>" + y + "</y>",
            "</found>"
        }, true);
    }

    /**
     * Writes the lines to the XML output file in UTF-8, so the bytes agree
     * with the encoding named in the XML declaration. Failures are reported
     * on stderr rather than thrown because XML output is best-effort.
     *
     * @param lines  lines to write, each followed by a line separator
     * @param append true to append to the file, false to start it afresh
     * @return true if every line was written without error
     */
    private boolean writeXml(String[] lines, boolean append)
    {
        if (xmlOutputFile == null) {
            System.err.println("Failed to write to XML file: no output file has been opened");
            return false;
        }

        PrintWriter out = null;
        try {
            out = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(xmlOutputFile, append), XML_ENCODING));
            for (int i = 0; i < lines.length; i++) {
                out.println(lines[i]);
            }
            //PrintWriter swallows IOExceptions; checkError() flushes and reveals them
            if (out.checkError()) {
                throw new IOException("error while writing " + xmlOutputFile);
            }
            return true;
        }
        catch (IOException e) {
            System.err.println("Failed to write to XML file: " + e);
            return false;
        }
        catch (SecurityException e) {
            System.err.println("Failed to write to XML file: " + e);
            return false;
        }
        finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Escapes the five XML special characters so arbitrary search terms and
     * file paths cannot break the structure of the output document.
     *
     * @param text raw text; null is rendered as the string "null"
     * @return text safe to place inside element content
     */
    private static String escapeXml(String text)
    {
        String raw = String.valueOf(text);
        StringBuilder escaped = new StringBuilder(raw.length() + 16);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '&': escaped.append("&amp;"); break;
                case '<': escaped.append("&lt;"); break;
                case '>': escaped.append("&gt;"); break;
                case '"': escaped.append("&quot;"); break;
                case '\'': escaped.append("&apos;"); break;
                default: escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /** Prints the command line help. */
    private static void printUsage()
    {
        System.out.println("Usage: FindTextInRectangle input string [xmlFile] [-c]");
        System.out.println("\t\tinput\tThe pdf file or directory you wish to search.");
        System.out.println("\t\tstring\tThe string to search for (Use quotes if it contains spaces).");
        System.out.println("\nOptional parameters:");
        System.out.println("\txmlOutput\tThe name you want to give the XML search output file");
        System.out.println("\t[-c]\t\tadd -c to suppress output to console");
    }

    //////////////////////////////////////////////////////////////////////////
    /**
     * main routine which checks for any files passed and runs the demo.
     * Expects: input string [xmlFile] [-c]. Exits with status 1 on bad
     * arguments or if the input path does not exist.
     *
     * @param args command line arguments
     */
    public static void main(String[] args)
    {
        System.out.println("JPedal demo to search pdf files for text. Visit http://www.jpedal.org/support_egFTIR.php for further info.\n");

        String searchPath = null;
        String textToFind = null;
        int parameterCount = 0;

        //check user has passed us a filename, -c can be anywhere but other parameters have to be in order
        if (args.length > 1 && args.length <= 4) {
            for (int i = 0; i < args.length; i++) {
                if (args[i].equalsIgnoreCase("-c")) {
                    enableSTDout = false;
                }
                else {
                    switch (parameterCount) {
                        case 0: searchPath = args[i]; break;
                        case 1: textToFind = args[i]; break;
                        case 2: xmlOutputPath = args[i]; enableXML = true; break;
                        default: break; //too many parameters - rejected by the count check below
                    }
                    parameterCount++;
                }
            }
        }

        if (parameterCount < 2 || parameterCount > 3) {
            printUsage();
            //@exit
            System.exit(1);
        }

        if (enableSTDout) {
            System.out.println("Search Target: " + searchPath + " Searching for: " + textToFind);
            if (enableXML) {
                System.out.println("XML File: " + xmlOutputPath);
            }
        }

        //check file exists
        File pdf_file = new File(searchPath);
        if (!pdf_file.exists()) {
            System.out.println("File " + searchPath + " not found");
            //@exit
            System.exit(1);
        }
        else {
            //the constructor runs the whole search
            new FindTextInRectangle(searchPath, textToFind);
        }
    }
}
|