<?xml version="1.0" encoding="UTF-8"?>

<!-- Netarkivet template -->

<!-- Facebook harvest template KB/SB (version: 2011-11-30) -->

<crawl-order xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="heritrix_settings.xsd">

    <meta>

        <name>forsider_plus_2niveauer_facebook</name>

        <!-- NOTE(review): the values quoted in this description (max-toe-threads=100,
             max-hops=2, timeout-seconds=300, pool-max-active=2) do not match the values
             configured below in this file (50, 25, 1200, 5). Presumably NetarchiveSuite
             overrides these at job generation; confirm, or align one side with the other. -->
        <description>Profile to harvest the frontpages and two hops with double-slash removal.

        max-toe-threads=100,

        max-hops=2

        timeout-seconds=300

        pool-max-active=2

</description>

        <operator>Admin</operator>

        <organization/>

        <audience>TESTERS</audience>

        <date>20080118111217</date>

    </meta>

    <controller>

        <string name="settings-directory">settings</string>

        <string name="disk-path"/>

        <string name="logs-path">logs</string>

        <string name="checkpoints-path">checkpoints</string>

        <string name="state-path">state</string>

        <string name="scratch-path">scratch</string>

        <long name="max-bytes-download">0</long>

        <long name="max-document-download">0</long>

        <long>0</long>

        <integer name="max-toe-threads">50</integer>

        <integer name="recorder-out-buffer-bytes">4096</integer>

        <integer name="recorder-in-buffer-bytes">65536</integer>

        <integer name="bdb-cache-percent">0</integer>

        <!-- DecidingScope migrated from DomainScope -->

        <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">

            <boolean name="enabled">true</boolean>

            <string name="seedsfile">seeds.txt</string>

            <boolean name="reread-seeds-on-config">true</boolean>

            <!-- DecideRuleSequence. Multiple DecideRules applied in order with last non-PASS the resulting decision -->

            <newObject name="decide-rules">

 

               <map>

                       <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule"/>

                       <newObject name="acceptURIFromSeedDomains" class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule">

                               <string name="decision">ACCEPT</string>

                               <string/>

                               <boolean name="seeds-as-surt-prefixes">true</boolean>

                               <string/>

                               <boolean>false</boolean>

                               <boolean name="rebuild-on-reconfig">true</boolean>

                       </newObject>

 

                       <newObject class="org.archive.crawler.deciderules.TooManyHopsDecideRule">

                               <integer name="max-hops">25</integer>

                       </newObject>

                       <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">

                               <integer name="max-repetitions">3</integer>

                       </newObject>

                       <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule">

                               <integer name="max-trans-hops">3</integer>

                               <integer name="max-speculative-hops">1</integer>

                       </newObject>

                       <newObject name="pathdepthfilter" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule">

                               <integer name="max-path-depth">20</integer>

                       </newObject>

 

                        <newObject name="webbyen">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*webbyen.*kontakt\.asp.*</string>

                       </newObject>

                       <newObject name="dr_dk">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*dr\.dk.*epg\.asp.*</string>

                       </newObject>

                       <newObject name="stiften">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*stiften.*adstream_mjx\.ads.*</string>

                       </newObject>

                       <newObject name="halibut_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*halibut\.dk\/cgi-bin.*</string>

                       </newObject>

                       <newObject name="cybercomputer_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*cybercomputer\.dk\/putikurv.*</string>

                       </newObject>

                       <newObject name="tawselovers_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">

                               <string name="decision">REJECT</string>

                               <string name="regexp">.*tawselovers.*action=buy_now.*</string>

                       </newObject>

               </map> <!-- end rules -->

            </newObject> <!-- end decide-rules -->

        </newObject> <!-- End DecidingScope -->

        <map name="http-headers">

            <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://netarkivet.dk/website/info.html)</string>

            <string name="from">netarkivet-svar@netarkivet.dk</string>

        </map>

        <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">

            <string name="type">ignore</string>

            <boolean name="masquerade">false</boolean>

            <text name="custom-robots"/>

            <stringList name="user-agents">

            </stringList>

        </newObject>

        <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier">

            <float name="delay-factor">1.0</float>

            <integer name="max-delay-ms">1000</integer>

            <integer name="min-delay-ms">300</integer>

            <integer name="max-retries">3</integer>

            <long name="retry-delay-seconds">300</long>

            <integer name="preference-embed-hops">1</integer>

            <integer name="total-bandwidth-usage-KB-sec">1500</integer>

            <integer name="max-per-host-bandwidth-usage-KB-sec">500</integer>

 

        <string name="queue-assignment-policy">dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy</string>

 

            <string name="force-queue-assignment"/>

            <boolean name="pause-at-start">false</boolean>

            <boolean name="pause-at-finish">false</boolean>

            <boolean name="source-tag-seeds">false</boolean>

            <boolean name="recovery-log-enabled">false</boolean>

            <boolean name="hold-queues">true</boolean>

            <integer name="balance-replenish-amount">3000</integer>

            <integer name="error-penalty-amount">100</integer>

            <long name="queue-total-budget">-1</long>

            <string name="cost-policy">org.archive.crawler.frontier.UnitCostAssignmentPolicy</string>

            <long name="snooze-deactivate-ms">300000</long>

            <integer name="target-ready-backlog">50</integer>

            <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>

            <boolean name="dump-pending-at-close">false</boolean>

        </newObject>

 

        <map name="uri-canonicalization-rules">

            <newObject name="Lowercase" class="org.archive.crawler.url.canonicalize.LowercaseRule">

                <boolean name="enabled">true</boolean>

            </newObject>

            <newObject name="Userinfo" class="org.archive.crawler.url.canonicalize.StripUserinfoRule">

                <boolean name="enabled">true</boolean>

            </newObject>

            <newObject name="WWW" class="org.archive.crawler.url.canonicalize.StripWWWRule">

                <boolean name="enabled">false</boolean>

            </newObject>

            <newObject name="SessionIDs" class="org.archive.crawler.url.canonicalize.StripSessionIDs">

                <boolean name="enabled">true</boolean>

            </newObject>

            <newObject name="QueryStrPrefix" class="org.archive.crawler.url.canonicalize.FixupQueryStr">

                <boolean name="enabled">true</boolean>

            </newObject>

        </map>

        <!-- Heritrix pre-fetch processors -->

        <map name="pre-fetch-processors">

 

            <newObject name="QuotaEnforcer" class="org.archive.crawler.prefetch.QuotaEnforcer">

                <boolean name="force-retire">false</boolean>

               <boolean name="enabled">true</boolean>

               <newObject name="QuotaEnforcer#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

              <long name="server-max-fetch-successes">-1</long>

              <long name="server-max-success-kb">-1</long>

              <long name="server-max-fetch-responses">-1</long>

              <long name="server-max-all-kb">-1</long>

 

              <long name="host-max-fetch-successes">-1</long>

              <long name="host-max-success-kb">-1</long>

              <long name="host-max-fetch-responses">-1</long>

              <long name="host-max-all-kb">-1</long>

 

              <long name="group-max-fetch-successes">-1</long>

              <long name="group-max-success-kb">-1</long>

              <long name="group-max-fetch-responses">-1</long>

              <long name="group-max-all-kb">-1</long>

 

            </newObject>

 

            <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector">

                <boolean name="enabled">true</boolean>

               <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

                <boolean name="override-logger">false</boolean>

                <boolean name="recheck-scope">true</boolean>

                <boolean name="block-all">false</boolean>

                <string name="block-by-regexp"/>

               <string name="allow-by-regexp"/>

            </newObject>

            <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer">

                <boolean name="enabled">true</boolean>

               <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

                <integer name="ip-validity-duration-seconds">21600</integer>

                <integer name="robot-validity-duration-seconds">86400</integer>

                <boolean name="calculate-robots-only">false</boolean>

            </newObject>

        </map> <!--End of Heritrix pre-fetch processors -->

        <!-- Heritrix fetch processors -->

        <map name="fetch-processors">

            <newObject name="DNS">

                <boolean name="enabled">true</boolean>

               <newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

                <boolean name="accept-non-dns-resolves">false</boolean>

               <boolean name="digest-content">true</boolean>

               <string name="digest-algorithm">sha1</string>

 

            </newObject>

            <newObject name="HTTP">

                <boolean name="enabled">true</boolean>

                <newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

                <newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map name="rules">

                       </map> 

               </newObject>

               <integer name="timeout-seconds">1200</integer>

                <integer name="sotimeout-ms">20000</integer>

                <integer name="fetch-bandwidth">0</integer>

                <long name="max-length-bytes">0</long>

                <boolean name="ignore-cookies">false</boolean>

                <boolean name="use-bdb-for-cookies">true</boolean>

                <string name="load-cookies-from-file"/>

                <string name="save-cookies-to-file"/>

                <string name="trust-level">open</string>

                <stringList name="accept-headers">

                </stringList>

                <string/>

                <string name="http-proxy-port"/>

                <string name="default-encoding">ISO-8859-1</string>

               <boolean name="digest-content">true</boolean>

               <string name="digest-algorithm">sha1</string>

               <boolean name="send-if-modified-since">true</boolean>

               <boolean name="send-if-none-match">true</boolean>

                <boolean name="send-connection-close">true</boolean>

                <boolean name="send-referer">true</boolean>

                <boolean>false</boolean>

               <string name="http-bind-address"/>

    </newObject>

        </map> <!-- end of Heritrix Fetch processors -->

 

        <!-- Heritrix extract processors -->

        <map name="extract-processors">

            <newObject class="org.archive.crawler.extractor.ExtractorHTTP">

                <boolean name="enabled">true</boolean>

               <newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map name="rules">

                       </map>

               </newObject>

            </newObject>

            <newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML">

                <boolean name="enabled">true</boolean>

               <newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

               <boolean name="extract-javascript">true</boolean>

               <boolean name="treat-frames-as-embed-links">true</boolean>

               <boolean name="ignore-form-action-urls">true</boolean>

               <boolean name="extract-value-attributes">true</boolean>

               <boolean name="ignore-unexpected-html">true</boolean>

            </newObject>

            <newObject name="ExtractorCSS">

                <boolean name="enabled">true</boolean>

               <newObject name="ExtractorCSS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

            </newObject>

            <newObject name="ExtractorJS" class="org.archive.crawler.extractor.ExtractorJS">

                <boolean name="enabled">true</boolean>

               <newObject name="ExtractorJS#decide-rules">

                       <map>

                       </map>

               </newObject>

            </newObject>

            <newObject name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF">

                <boolean name="enabled">true</boolean>

               <newObject name="ExtractorSWF#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

             </newObject>

<!-- Restored missing name/class: the trigger-regexp / build-pattern /
     remove-trigger-uris settings identify this as ExtractorImpliedURI, which
     implements the "double-slash removal" promised in the profile description
     by rewriting URIs matching trigger-regexp via build-pattern. -->
            <newObject name="ExtractorImpliedURI" class="org.archive.crawler.extractor.ExtractorImpliedURI">

                <boolean name="enabled">true</boolean>

                <string name="trigger-regexp">(^http.*://.*)//(.*$)</string>

                <string name="build-pattern">$1/$2</string>

                <boolean name="remove-trigger-uris">false</boolean>

            </newObject>

 

        </map> <!-- end of Heritrix extract processors -->

        <!-- Heritrix write processors --> 

        <map name="write-processors">

            <newObject class="is.hi.bok.deduplicator.DeDuplicator">

               <boolean>true</boolean>

               <map>

               </map>

               <string/>

               <string>By URL</string>

               <boolean name="try-equivalent">true</boolean>

               <boolean name="change-content-size">false</boolean>

               <string name="mime-filter">^text/.*</string>

               <string name="filter-mode">Blacklist</string>

               <string>Timestamp</string>

               <string>SEVERE</string>

               <string/>

               <string>Use index information</string>

               <boolean name="stats-per-host">true</boolean>

                <boolean>true</boolean>

           </newObject>

        <newObject name="Archiver" class="org.archive.crawler.writer.ARCWriterProcessor">

                <boolean name="enabled">true</boolean>

               <newObject name="Archiver#decide-rules">

                       <map>

                       </map>

               </newObject>

               <boolean name="compress">false</boolean>

                <string name="prefix">IAH</string>

                <string name="suffix">${HOSTNAME}</string>

                <integer name="max-size-bytes">100000000</integer>

                <stringList name="path">

                    <string>arcs</string>

                </stringList>

                <integer name="pool-max-active">5</integer>

                <integer name="pool-max-wait">300000</integer>

                <long name="total-bytes-to-write">0</long>

               <boolean name="skip-identical-digests">false</boolean>

    </newObject>

 

        </map> <!-- End of Heritrix write processors -->

        <!-- Heritrix post processors -->

        <map name="post-processors">

            <newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">

                <boolean name="enabled">true</boolean>

               <newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

            </newObject>

            <newObject name="LinksScoper">

                <boolean name="enabled">true</boolean>

               <newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

               <boolean name="override-logger">false</boolean>

                <boolean name="seed-redirects-new-seed">true</boolean>

               <integer name="preference-depth-hops">-1</integer>

 

               <newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

            </newObject>

 

        <newObject name="Scheduler" class="org.archive.crawler.postprocessor.FrontierScheduler">

                <boolean name="enabled">true</boolean>

               <newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

        </newObject>

 

        <newObject name="ContentSize">

               <boolean>true</boolean>

               <newObject name="ContentSize#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">

                       <map>

                       </map>

               </newObject>

        </newObject>

 

        </map>  <!-- end of Heritrix post processors -->

 

        <map name="loggers">

            <newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">

                <integer name="interval-seconds">20</integer>

            </newObject>

        </map>

        <string name="recover-path"/>

        <boolean name="checkpoint-copy-bdbje-logs">true</boolean>

        <boolean name="recover-retain-failures">false</boolean>

        <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore">

            <map name="credentials">

            </map>

        </newObject>

    </controller>

</crawl-order>