Reading a WARC file
Out of date API
Note that this example uses an out-of-date API and will not compile against newer JWAT releases (for example, JWAT 0.9.1). See the JWAT-Tools source code for more up-to-date examples.
Not final!
TestWarc.java
import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Collection; import java.util.Iterator; import org.jwat.warc.WarcReader; import org.jwat.warc.WarcReaderFactory; import org.jwat.warc.WarcRecord; import org.jwat.warc.WarcValidationError; public class TestWarc { static String warcFile = "/home/nicl/Downloads/IAH-20080430204825-00000-blackbook.warc"; //static String warcFile = "/home/nicl/Downloads/MYWARC.warc"; public static void main(String[] args) { File file = new File( warcFile ); try { InputStream in = new FileInputStream( file ); int records = 0; int errors = 0; WarcReader reader = WarcReaderFactory.getReader( in ); WarcRecord record; while ( (record = reader.getNextRecord()) != null ) { printRecord(record); printRecordErrors(record); ++records; if (record.hasErrors()) { errors += record.getValidationErrors().size(); } } System.out.println("--------------"); System.out.println(" Records: " + records); System.out.println(" Errors: " + errors); reader.close(); in.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void printRecord(WarcRecord record) { System.out.println("--------------"); System.out.println(" Version: " + record.bMagicIdentified + " " + record.bVersionParsed + " " + record.major + "." 
+ record.minor); System.out.println(" TypeIdx: " + record.warcTypeIdx); System.out.println(" Type: " + record.warcTypeStr); System.out.println(" Filename: " + record.warcFilename); System.out.println(" Record-ID: " + record.warcRecordIdUri); System.out.println(" Date: " + record.warcDate); System.out.println("Content-Length: " + record.contentLength); System.out.println(" Content-Type: " + record.contentType); System.out.println(" Truncated: " + record.warcTruncatedStr); System.out.println(" InetAddress: " + record.warcInetAddress); System.out.println(" ConcurrentTo: " + record.warcConcurrentToUriList); System.out.println(" RefersTo: " + record.warcRefersToUri); System.out.println(" TargetUri: " + record.warcTargetUriUri); System.out.println(" WarcInfo-Id: " + record.warcWarcInfoIdUri); System.out.println(" BlockDigest: " + record.warcBlockDigest); System.out.println(" PayloadDigest: " + record.warcPayloadDigest); System.out.println("IdentPloadType: " + record.warcIdentifiedPayloadType); System.out.println(" Profile: " + record.warcProfileStr); System.out.println(" Segment#: " + record.warcSegmentNumber); System.out.println(" SegmentOrg-Id: " + record.warcSegmentOriginIdUrl); System.out.println("SegmentTLength: " + record.warcSegmentTotalLength); } public static void printRecordErrors(WarcRecord record) { if (record.hasErrors()) { Collection<WarcValidationError> errorCol = record.getValidationErrors(); if (errorCol != null && errorCol.size() > 0) { Iterator<WarcValidationError> iter = errorCol.iterator(); while (iter.hasNext()) { WarcValidationError error = iter.next(); System.out.println( error.error ); System.out.println( error.field ); System.out.println( error.value ); } } } } }