<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://wiki.apertium.org/w/index.php?action=history&amp;feed=atom&amp;title=Writing_a_scraper</id>
	<title>Writing a scraper - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://wiki.apertium.org/w/index.php?action=history&amp;feed=atom&amp;title=Writing_a_scraper"/>
	<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;action=history"/>
	<updated>2026-04-27T15:30:34Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.34.1</generator>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=69108&amp;oldid=prev</id>
		<title>Firespeaker at 05:20, 29 March 2019</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=69108&amp;oldid=prev"/>
		<updated>2019-03-29T05:20:20Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 05:20, 29 March 2019&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;This page outlines how to develop a scraper for &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;apertium&lt;/del&gt; using our &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;RFERL&lt;/del&gt; scraper.  The code can be found &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;in&lt;/del&gt; &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;our subversion repository&lt;/del&gt; at [https://&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;svn&lt;/del&gt;.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;code.sf.net/p&lt;/del&gt;/apertium&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;/svn/trunk&lt;/del&gt;/apertium-&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;tools/scraper&lt;/del&gt; https://&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;svn&lt;/del&gt;.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;code.sf.net/p&lt;/del&gt;/apertium&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;/svn/trunk&lt;/del&gt;/apertium-&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;tools/scraper&lt;/del&gt;].&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;This page outlines how to develop a scraper&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; of news websites&lt;/ins&gt; for &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;Apertium&lt;/ins&gt; using our &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&#039;&#039;&#039;news&lt;/ins&gt; scraper&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&#039;&#039;&#039;&lt;/ins&gt;.  The code can be found &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;on&lt;/ins&gt; &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;GitHub&lt;/ins&gt; at [https://&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;github&lt;/ins&gt;.&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;com&lt;/ins&gt;/apertium/apertium-&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;news-scrapers&lt;/ins&gt; https://&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;github&lt;/ins&gt;.&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;com&lt;/ins&gt;/apertium/apertium-&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;news-scrapers&lt;/ins&gt;].&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;== How-To ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;== How-To ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=60196&amp;oldid=prev</id>
		<title>Rcrowther at 16:57, 26 September 2016</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=60196&amp;oldid=prev"/>
		<updated>2016-09-26T16:57:39Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 16:57, 26 September 2016&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 137:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 137:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&#039;&#039;&#039;Research:&#039;&#039;&#039; Testing shows that the problem is occurring when two break tags are present on two separate lines and they are directly followed by another tag, generally an &amp;lt;code&amp;gt;em&amp;lt;/code&amp;gt; or a &amp;lt;code&amp;gt;strong&amp;lt;/code&amp;gt;, however the same problem has been observed with other tags. In the case that the break tags are seperated by text, lxml properly handles them. However, in the case that they are not, lxml fails to properly recognize the break tags. [https://pastee.org/ev5dc Test script], [https://pastee.org/pp6h4 Test HTML] &amp;lt;br /&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&#039;&#039;&#039;Research:&#039;&#039;&#039; Testing shows that the problem is occurring when two break tags are present on two separate lines and they are directly followed by another tag, generally an &amp;lt;code&amp;gt;em&amp;lt;/code&amp;gt; or a &amp;lt;code&amp;gt;strong&amp;lt;/code&amp;gt;, however the same problem has been observed with other tags. In the case that the break tags are seperated by text, lxml properly handles them. However, in the case that they are not, lxml fails to properly recognize the break tags. [https://pastee.org/ev5dc Test script], [https://pastee.org/pp6h4 Test HTML] &amp;lt;br /&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&#039;&#039;&#039;Suggested Solution:&#039;&#039;&#039; Submit a bug report to lxml. We could create [http://lxml.de/element_classes.html custom Element classes]? I&#039;m fairly sure that even if we managed to do that, it would be fairly inelegant. A [https://bugs.launchpad.net/lxml/+bug/1095945 bug report] has been filed. Turns out that the bug was in libxml2 rather than lxml and was addressed in a newer version of libxml2 (check the bug report)&amp;lt;br /&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&#039;&#039;&#039;Suggested Solution:&#039;&#039;&#039; Submit a bug report to lxml. We could create [http://lxml.de/element_classes.html custom Element classes]? I&#039;m fairly sure that even if we managed to do that, it would be fairly inelegant. A [https://bugs.launchpad.net/lxml/+bug/1095945 bug report] has been filed. Turns out that the bug was in libxml2 rather than lxml and was addressed in a newer version of libxml2 (check the bug report)&amp;lt;br /&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;[[Category:Documentation in English]]&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Rcrowther</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52344&amp;oldid=prev</id>
		<title>Putti: /* Use Scraper class and test */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52344&amp;oldid=prev"/>
		<updated>2014-12-28T22:43:40Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Use Scraper class and test&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 22:43, 28 December 2014&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 87:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 87:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;w = Writer()&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;w = Writer()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;# Create a SIGTERM signal handler in order to save all the article data if kill command is called. &lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;def term_handler(sigNum, frame):&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;	w.close()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;	sys.exit(0)&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;signal.signal(signal.SIGTERM, term_handler)&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;try:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;try:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;	for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;	for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Putti</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52343&amp;oldid=prev</id>
		<title>Putti: /* Use Scraper class and test */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52343&amp;oldid=prev"/>
		<updated>2014-12-26T21:16:48Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Use Scraper class and test&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:16, 26 December 2014&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Put the main code inside a &amp;lt;code&amp;gt;try&amp;lt;/code&amp;gt; structure and &amp;lt;code&amp;gt;expect&amp;lt;/code&amp;gt; a &amp;lt;code&amp;gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;KeyboardInterrup&lt;/del&gt;&amp;lt;/code&amp;gt;, in order to save all scraped data when user does a keyboard interrupt (^C).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Put the main code inside a &amp;lt;code&amp;gt;try&amp;lt;/code&amp;gt; structure and &amp;lt;code&amp;gt;expect&amp;lt;/code&amp;gt; a &amp;lt;code&amp;gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;KeyboardInterrupt&lt;/ins&gt;&amp;lt;/code&amp;gt;, in order to save all&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; the&lt;/ins&gt; scraped data when user does a keyboard interrupt (^C).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Putti</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52342&amp;oldid=prev</id>
		<title>Putti: /* Use Scraper class and test */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52342&amp;oldid=prev"/>
		<updated>2014-12-26T21:16:08Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Use Scraper class and test&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:16, 26 December 2014&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Put the main code &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;in side&lt;/del&gt; a &amp;lt;code&amp;gt;try&amp;lt;/code&amp;gt; structure and &amp;lt;code&amp;gt;expect&amp;lt;/code&amp;gt; a &amp;lt;code&amp;gt;KeyboardInterrup&amp;lt;/code&amp;gt;, in order to save all scraped data when user does a keyboard interrupt (^C).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Put the main code &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;inside&lt;/ins&gt; a &amp;lt;code&amp;gt;try&amp;lt;/code&amp;gt; structure and &amp;lt;code&amp;gt;expect&amp;lt;/code&amp;gt; a &amp;lt;code&amp;gt;KeyboardInterrup&amp;lt;/code&amp;gt;, in order to save all scraped data when user does a keyboard interrupt (^C).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Putti</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52341&amp;oldid=prev</id>
		<title>Putti: /* Use Scraper class and test */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=52341&amp;oldid=prev"/>
		<updated>2014-12-26T21:15:34Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Use Scraper class and test&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:15, 26 December 2014&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 83:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Put the main code in side a &amp;lt;code&amp;gt;try&amp;lt;/code&amp;gt; structure and &amp;lt;code&amp;gt;expect&amp;lt;/code&amp;gt; a &amp;lt;code&amp;gt;KeyboardInterrup&amp;lt;/code&amp;gt;, in order to save all scraped data when user does a keyboard interrupt (^C).&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;w = Writer()&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;w = Writer()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;a class=&quot;mw-diff-movedpara-right&quot; title=&quot;Paragraph was moved. Click to jump to old location.&quot; href=&quot;#movedpara_6_0_lhs&quot;&gt;&amp;#x26AB;&lt;/a&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;a name=&quot;movedpara_3_0_rhs&quot;&gt;&lt;/a&gt;try:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/ins&gt;for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;a class=&quot;mw-diff-movedpara-left&quot; title=&quot;Paragraph was moved. Click to jump to new location.&quot; href=&quot;#movedpara_3_0_rhs&quot;&gt;&amp;#x26AB;&lt;/a&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;&lt;a name=&quot;movedpara_6_0_lhs&quot;&gt;&lt;/a&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/del&gt;try:&lt;/div&gt;&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-added&quot;&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;		try:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		source = Source(url, title=title, date=date, scraper=ScraperAzadliq, conn=conn) #replace scraper with the one you created earlier&lt;/div&gt;&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-added&quot;&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		source&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;.makeRoot&lt;/del&gt;(&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;&quot;./&quot;&lt;/del&gt;, &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;ids&lt;/del&gt;=&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;ids&lt;/del&gt;, &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;root&lt;/del&gt;=&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;root&lt;/del&gt;, &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;lang&lt;/del&gt;=&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;&quot;aze&quot;&lt;/del&gt;) #replace &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;language&lt;/del&gt; with the&lt;del class=&quot;diffchange diffchange-inline&quot;&gt; appropriate&lt;/del&gt; one&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/ins&gt;		source&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; = Source&lt;/ins&gt;(&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;url&lt;/ins&gt;, &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;title&lt;/ins&gt;=&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;title&lt;/ins&gt;, &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;date&lt;/ins&gt;=&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;date&lt;/ins&gt;, &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;scraper&lt;/ins&gt;=&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;ScraperAzadliq, conn=conn&lt;/ins&gt;) #replace &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;scraper&lt;/ins&gt; with the one&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; you created earlier&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;			source.makeRoot(&quot;./&quot;, ids=ids, root=root, lang=&quot;aze&quot;) #replace language with the appropriate one&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		source.add_to_archive()&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/ins&gt;		source.add_to_archive()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		if ids is None:&lt;/div&gt;&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-added&quot;&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;			ids &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;=&lt;/del&gt; &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;source.ids&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;			&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;if &lt;/ins&gt;ids &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;is&lt;/ins&gt; &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;None:&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;				ids = source.ids&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		if root is None:&lt;/div&gt;&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-added&quot;&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;			root &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;=&lt;/del&gt; &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;source.root&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;			&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;if &lt;/ins&gt;root &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;is&lt;/ins&gt; &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;None:&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;				root = source.root&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/del&gt;except Exception as e:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;		&lt;/ins&gt;except Exception as e:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;		print(url + &quot; &quot; + str(e))&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;	&lt;/ins&gt;		print(url + &quot; &quot; + str(e))&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;except KeyboardInterrupt:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;	print(&quot;\nReceived a keyboard interrupt. Closing the program.&quot;)&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;w.close()&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;w.close()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/pre&amp;gt;&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/pre&amp;gt;&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Putti</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=51182&amp;oldid=prev</id>
		<title>Putti: /* Use Scraper class and test */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=51182&amp;oldid=prev"/>
		<updated>2014-12-08T10:09:05Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Use Scraper class and test&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 10:09, 8 December 2014&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 79:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 79:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Finally, in the driver script loop through the list of articles and send each article to the &amp;lt;code&amp;gt;Scraper&amp;lt;/code&amp;gt; class you created to fill the corpus with articles. Have a look at the various &amp;lt;code&amp;gt;scrp-*.py&amp;lt;/code&amp;gt; scripts currently available to get a feel for how to use the &amp;lt;code&amp;gt;Scraper&amp;lt;/code&amp;gt; class. The code below demonstrates the basic idea.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Finally, in the driver script loop through the list of articles and send each article to the &amp;lt;code&amp;gt;Scraper&amp;lt;/code&amp;gt; class you created to fill the corpus with articles. Have a look at the various &amp;lt;code&amp;gt;scrp-*.py&amp;lt;/code&amp;gt; scripts currently available to get a feel for how to use the &amp;lt;code&amp;gt;Scraper&amp;lt;/code&amp;gt; class. The code below demonstrates the basic idea.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Create a new &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object in order to save scraped articles. The default writing interval is 60 seconds. To change the it add the amount of seconds as a parameter when calling &amp;lt;code&amp;gt;Writer()&amp;lt;/code&amp;gt;. For example if we want to write the data every 30 seconds call &amp;lt;code&amp;gt;Writer(30)&amp;lt;/code&amp;gt;.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Make sure to set the correct language code when setting up the &amp;lt;code&amp;gt;Source&amp;lt;/code&amp;gt; class.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Make sure to set the correct language code when setting up the &amp;lt;code&amp;gt;Source&amp;lt;/code&amp;gt; class.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Catch exceptions that occur during scraping but don&#039;t fail silently. You don&#039;t want a single badly formatted article to stop the entire process.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Remeber to call &amp;lt;code&amp;gt;Writer&amp;lt;/code&amp;gt; object&#039;s &amp;lt;code&amp;gt;close()&amp;lt;/code&amp;gt; function before closing the scraper. It will make sure all the data has been saved correctly.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/ul&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;code&amp;gt;&amp;lt;pre&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;w = Writer()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;for (title, url, date) in articles:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;	try:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;	try:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 94:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 97:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;	except Exception as e:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;	except Exception as e:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;		print(url + &quot; &quot; + str(e))&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;		print(url + &quot; &quot; + str(e))&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;w.close()&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;/pre&amp;gt;&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;/pre&amp;gt;&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Scrape a sufficient amount of test articles to determine whether there is any extraneous output in the generated corpus (check the XML file created). If you discover that something is wrong, check the &amp;lt;code&amp;gt;scraped()&amp;lt;/code&amp;gt; function again to make sure that you&#039;ve removed all the bad elements.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;&amp;lt;li&amp;gt;Scrape a sufficient amount of test articles to determine whether there is any extraneous output in the generated corpus (check the XML file created). If you discover that something is wrong, check the &amp;lt;code&amp;gt;scraped()&amp;lt;/code&amp;gt; function again to make sure that you&#039;ve removed all the bad elements.&amp;lt;/li&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Putti</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45020&amp;oldid=prev</id>
		<title>Sushain at 21:53, 20 November 2013</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45020&amp;oldid=prev"/>
		<updated>2013-11-20T21:53:01Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:53, 20 November 2013&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 1:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This page outlines how to develop a scraper for apertium using our RFERL scraper.  The code can be found in our subversion repository at [https://svn.code.sf.net/p/apertium/svn/trunk/apertium-tools/scraper https://svn.code.sf.net/p/apertium/svn/trunk/apertium-tools/scraper].&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This page outlines how to develop a scraper for apertium using our RFERL scraper.  The code can be found in our subversion repository at [https://svn.code.sf.net/p/apertium/svn/trunk/apertium-tools/scraper https://svn.code.sf.net/p/apertium/svn/trunk/apertium-tools/scraper].&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;== &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;Outline&lt;/del&gt; ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;== &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;How-To&lt;/ins&gt; ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;===Get to know the website===&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;===Get to know the website===&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Sushain</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45007&amp;oldid=prev</id>
		<title>Sushain: /* RFERL */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45007&amp;oldid=prev"/>
		<updated>2013-11-20T18:18:18Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;RFERL&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:18, 20 November 2013&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 111:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 111:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;20131120&quot; indicates that the publication date of the articles displayed will be &quot;11/20/2013&quot;. Varying this part of the URL in your scraping program will allow you to scrape a given date range.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;* &quot;20131120&quot; indicates that the publication date of the articles displayed will be &quot;11/20/2013&quot;. Varying this part of the URL in your scraping program will allow you to scrape a given date range.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;330&quot; is the identifier this website uses for the &quot;news&quot; category. In this case it&#039;s duplicated; however, this might not always be the case.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;* &quot;330&quot; is the identifier this website uses for the &quot;news&quot; category. In this case it&#039;s duplicated; however, this might not always be the case.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;25174069&quot; is&lt;/div&gt;&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-added&quot;&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;== Issues with newlines ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;== Issues with newlines ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Sushain</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45006&amp;oldid=prev</id>
		<title>Sushain: /* RFERL */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Writing_a_scraper&amp;diff=45006&amp;oldid=prev"/>
		<updated>2013-11-20T18:12:05Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;RFERL&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:12, 20 November 2013&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 105:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 105:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;If you are scraping [[RFERL_corpora|RFERL content]], you will need category names and numbers of only the real content categories (e.g. news, politics).&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;If you are scraping [[RFERL_corpora|RFERL content]], you will need category names and numbers of only the real content categories (e.g. news, politics).&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;Consider the example URL &quot;http://example.com/archive/news/20131120/330/330.html&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;?id=25174069&lt;/del&gt;&quot; which often resembles the URL structure in RFERL websites. This URL yields a page with a list of articles you can scrape. Following is a summary of the URL&#039;s parts:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;Consider the example URL &quot;http://example.com/archive/news/20131120/330/330.html&quot; which often resembles the URL structure in RFERL websites. This URL yields a page with a list of articles you can scrape. Following is a summary of the URL&#039;s parts:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;archive&quot; indicates that you are exploring the site&#039;s archive, the goal of scraping.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;* &quot;archive&quot; indicates that you are exploring the site&#039;s archive, the goal of scraping.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 111:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 111:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;20131120&quot; indicates that the publication date of the articles displayed will be &quot;11/20/2013&quot;. Varying this part of the URL in your scraping program will allow you to scrape a given date range.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;* &quot;20131120&quot; indicates that the publication date of the articles displayed will be &quot;11/20/2013&quot;. Varying this part of the URL in your scraping program will allow you to scrape a given date range.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;* &quot;330&quot; is the identifier this website uses for the &quot;news&quot; category. In this case it&#039;s duplicated; however, this might not always be the case.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;* &quot;330&quot; is the identifier this website uses for the &quot;news&quot; category. In this case it&#039;s duplicated; however, this might not always be the case.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;* &quot;25174069&quot; is&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;== Issues with newlines ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;== Issues with newlines ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Sushain</name></author>
		
	</entry>
</feed>