<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://wiki.apertium.org/w/index.php?action=history&amp;feed=atom&amp;title=Wikipedia_Extractor</id>
	<title>Wikipedia Extractor - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://wiki.apertium.org/w/index.php?action=history&amp;feed=atom&amp;title=Wikipedia_Extractor"/>
	<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;action=history"/>
	<updated>2026-05-05T19:26:22Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.34.1</generator>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74186&amp;oldid=prev</id>
		<title>Firespeaker at 18:55, 30 January 2023</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74186&amp;oldid=prev"/>
		<updated>2023-01-30T18:55:09Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:55, 30 January 2023&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 2:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 2:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;== Goal ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;== Goal ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;This tool extracts main text from xml [[Wikipedia dump]] files (at &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;http&lt;/del&gt;://dumps.wikimedia.org/backup-index.html, ideally the &quot;pages-articles&quot; file), producing a text corpus, which is useful for training unsupervised part-of-speech taggers, n-gram language models, etc.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;This tool extracts main text from xml [[Wikipedia dump]] files (at &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;https&lt;/ins&gt;://dumps.wikimedia.org/backup-index.html, ideally the &quot;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;&#039;&#039;&#039;&lt;/ins&gt;pages-articles&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;.xml.bz2&#039;&#039;&#039;&lt;/ins&gt;&quot; file), producing a text corpus, which is useful for training unsupervised part-of-speech taggers, n-gram language models, etc.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;It was modified by a number of people, including by BenStobaugh during Google Code-In 2013, and can be cloned from GitHub at [https://github.com/apertium/WikiExtractor https://github.com/apertium/WikiExtractor].&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;It was modified by a number of people, including by BenStobaugh during Google Code-In 2013, and can be cloned from GitHub at [https://github.com/apertium/WikiExtractor https://github.com/apertium/WikiExtractor].&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74185&amp;oldid=prev</id>
		<title>Firespeaker: /* Steps */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74185&amp;oldid=prev"/>
		<updated>2023-01-30T18:53:22Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Steps&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:53, 30 January 2023&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 27:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 27:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-20230120-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-20230120-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;This will output a file called &amp;lt;code&amp;gt;wiki.txt.bz2&amp;lt;/code&amp;gt;.  You will probably want to rename it to something like &amp;lt;code&amp;gt;xyz.wikipedia.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;20210620&lt;/del&gt;.txt.bz2&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;This will output a file called &amp;lt;code&amp;gt;wiki.txt.bz2&amp;lt;/code&amp;gt;.  You will probably want to rename it to something like &amp;lt;code&amp;gt;xyz.wikipedia.&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;20230120&lt;/ins&gt;.txt.bz2&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;==See also==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;==See also==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74184&amp;oldid=prev</id>
		<title>Firespeaker at 18:53, 30 January 2023</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74184&amp;oldid=prev"/>
		<updated>2023-01-30T18:53:06Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:53, 30 January 2023&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 23:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 23:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name for a language with the &amp;lt;tt&amp;gt;xyz&amp;lt;/tt&amp;gt; code:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name for a language with the &amp;lt;tt&amp;gt;xyz&amp;lt;/tt&amp;gt; code:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;20210620&lt;/del&gt;/xyzwiki-20230120-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;20230120&lt;/ins&gt;/xyzwiki-20230120-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;20210620&lt;/del&gt;-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;20230120&lt;/ins&gt;-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This will output a file called &amp;lt;code&amp;gt;wiki.txt.bz2&amp;lt;/code&amp;gt;.  You will probably want to rename it to something like &amp;lt;code&amp;gt;xyz.wikipedia.20210620.txt.bz2&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This will output a file called &amp;lt;code&amp;gt;wiki.txt.bz2&amp;lt;/code&amp;gt;.  You will probably want to rename it to something like &amp;lt;code&amp;gt;xyz.wikipedia.20210620.txt.bz2&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74183&amp;oldid=prev</id>
		<title>Firespeaker at 18:52, 30 January 2023</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=74183&amp;oldid=prev"/>
		<updated>2023-01-30T18:52:42Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 18:52, 30 January 2023&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 23:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 23:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name for a language with the &amp;lt;tt&amp;gt;xyz&amp;lt;/tt&amp;gt; code:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name for a language with the &amp;lt;tt&amp;gt;xyz&amp;lt;/tt&amp;gt; code:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/20210620/xyzwiki-&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;20210620&lt;/del&gt;-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/20210620/xyzwiki-&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;20230120&lt;/ins&gt;-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-20210620-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-20210620-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73727&amp;oldid=prev</id>
		<title>Francis Tyers: /* Steps */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73727&amp;oldid=prev"/>
		<updated>2021-08-25T15:51:47Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Steps&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 15:51, 25 August 2021&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 22:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 22:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Get the &amp;lt;code&amp;gt;WikiExtractor.py&amp;lt;/code&amp;gt; script from https://github.com/apertium/WikiExtractor:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Get the &amp;lt;code&amp;gt;WikiExtractor.py&amp;lt;/code&amp;gt; script from https://github.com/apertium/WikiExtractor:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; for a language with the &amp;lt;tt&amp;gt;xyz&amp;lt;/tt&amp;gt; code&lt;/ins&gt;:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/20210620/xyzwiki-20210620-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/20210620/xyzwiki-20210620-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Francis Tyers</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73554&amp;oldid=prev</id>
		<title>Firespeaker: /* Steps */</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73554&amp;oldid=prev"/>
		<updated>2021-06-30T19:00:26Z</updated>

		<summary type="html">&lt;p&gt;&lt;span dir=&quot;auto&quot;&gt;&lt;span class=&quot;autocomment&quot;&gt;Steps&lt;/span&gt;&lt;/span&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 19:00, 30 June 2021&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 17:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 17:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;== Steps ==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;== Steps ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;Here&#039;s a simple step-by-step guide to the above.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;# Get the &amp;lt;code&amp;gt;WikiExtractor.py&amp;lt;/code&amp;gt; script from https://github.com/apertium/WikiExtractor:&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;# Get the &amp;lt;code&amp;gt;WikiExtractor.py&amp;lt;/code&amp;gt; script from https://github.com/apertium/WikiExtractor:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73553&amp;oldid=prev</id>
		<title>Firespeaker at 19:00, 30 June 2021</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=73553&amp;oldid=prev"/>
		<updated>2021-06-30T19:00:09Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 19:00, 30 June 2021&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 15:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 15:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;You can also run &amp;lt;code&amp;gt;python3 WikiExtractor.py --help&amp;lt;/code&amp;gt; to get more details.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;You can also run &amp;lt;code&amp;gt;python3 WikiExtractor.py --help&amp;lt;/code&amp;gt; to get more details.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;== Steps ==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;# Get the &amp;lt;code&amp;gt;WikiExtractor.py&amp;lt;/code&amp;gt; script from https://github.com/apertium/WikiExtractor:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://raw.githubusercontent.com/apertium/WikiExtractor/master/WikiExtractor.py&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;# Download the Wikipedia dump for the language in question from http://dumps.wikimedia.org/backup-index.html.  The below uses a hypothetical file name:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ wget https://dumps.wikimedia.org/xyzwiki/20210620/xyzwiki-20210620-pages-articles.xml.bz2&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;# Run the script on the Wikipedia dump file:&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;#: &amp;lt;code&amp;gt;$ python3 WikiExtractor.py --infn xyzwiki-20210620-pages-articles.xml.bz2 --compress&amp;lt;/code&amp;gt;&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;This will output a file called &amp;lt;code&amp;gt;wiki.txt.bz2&amp;lt;/code&amp;gt;.  You will probably want to rename it to something like &amp;lt;code&amp;gt;xyz.wikipedia.20210620.txt.bz2&amp;lt;/code&amp;gt;.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;==See also==&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;==See also==&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=72332&amp;oldid=prev</id>
		<title>Firespeaker at 21:37, 4 June 2020</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=72332&amp;oldid=prev"/>
		<updated>2020-06-04T21:37:28Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 21:37, 4 June 2020&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 4:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 4:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This tool extracts main text from xml [[Wikipedia dump]] files (at http://dumps.wikimedia.org/backup-index.html, ideally the &quot;pages-articles&quot; file), producing a text corpus, which is useful for training unsupervised part-of-speech taggers, n-gram language models, etc.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This tool extracts main text from xml [[Wikipedia dump]] files (at http://dumps.wikimedia.org/backup-index.html, ideally the &quot;pages-articles&quot; file), producing a text corpus, which is useful for training unsupervised part-of-speech taggers, n-gram language models, etc.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;It was &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;written&lt;/del&gt; by BenStobaugh during &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;GCI&lt;/del&gt;-2013 and can be &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;downloaded&lt;/del&gt; from &lt;del class=&quot;diffchange diffchange-inline&quot;&gt;SVN&lt;/del&gt; at [https://&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;svn&lt;/del&gt;.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;code.sf.net/p&lt;/del&gt;/apertium&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;/svn/trunk/apertium-tools&lt;/del&gt;/WikiExtractor&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;.py&lt;/del&gt; https://&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;svn&lt;/del&gt;.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;code.sf.net/p&lt;/del&gt;/apertium&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;/svn/trunk/apertium-tools&lt;/del&gt;/WikiExtractor.&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;py]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;It was &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;modified by a number of people, including&lt;/ins&gt; by BenStobaugh during &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;Google Code&lt;/ins&gt;-&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;In &lt;/ins&gt;2013&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;,&lt;/ins&gt; and can be &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;cloned&lt;/ins&gt; from &lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;GitHub&lt;/ins&gt; at [https://&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;github&lt;/ins&gt;.&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;com&lt;/ins&gt;/apertium/WikiExtractor https://&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;github&lt;/ins&gt;.&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;com&lt;/ins&gt;/apertium/WikiExtractor&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt;]&lt;/ins&gt;.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This version is much simpler than the old version. This version auto-removes any formatting and only outputs the text to one file. To use it, simply use the following command in your terminal, where dump.xml is the Wikipedia dump&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This version is much simpler than the old version. This version auto-removes any formatting and only outputs the text to one file. To use it, simply use the following command in your terminal, where dump.xml is the Wikipedia dump&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=64440&amp;oldid=prev</id>
		<title>Francis Tyers at 08:06, 13 September 2017</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=64440&amp;oldid=prev"/>
		<updated>2017-09-13T08:06:12Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 08:06, 13 September 2017&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 9:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 9:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt; $ python3 WikiExtractor.py --infn dump.xml.bz2&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt; $ python3 WikiExtractor.py --infn dump.xml.bz2&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-empty diff-side-deleted&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;(Note: If you are on a Mac, make sure that &amp;lt;tt&amp;gt;--&amp;lt;/tt&amp;gt; is really two hyphens and not an em-dash like this: &amp;amp;mdash;).&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This will run through all of the articles, get all of the text and put it in wiki.txt. This version also supports compression (BZip2 and Gzip), so you can use &amp;lt;code&amp;gt;dump.xml.bz2&amp;lt;/code&amp;gt; or &amp;lt;code&amp;gt;dump.xml.gz&amp;lt;/code&amp;gt; instead of &amp;lt;code&amp;gt;dump.xml.&amp;lt;/code&amp;gt; You can also compress (Bzip2) the output file by adding &amp;lt;code&amp;gt;--compress&amp;lt;/code&amp;gt; to the command.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This will run through all of the articles, get all of the text and put it in wiki.txt. This version also supports compression (BZip2 and Gzip), so you can use &amp;lt;code&amp;gt;dump.xml.bz2&amp;lt;/code&amp;gt; or &amp;lt;code&amp;gt;dump.xml.gz&amp;lt;/code&amp;gt; instead of &amp;lt;code&amp;gt;dump.xml.&amp;lt;/code&amp;gt; You can also compress (Bzip2) the output file by adding &amp;lt;code&amp;gt;--compress&amp;lt;/code&amp;gt; to the command.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Francis Tyers</name></author>
		
	</entry>
	<entry>
		<id>https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=62896&amp;oldid=prev</id>
		<title>Firespeaker at 03:03, 27 April 2017</title>
		<link rel="alternate" type="text/html" href="https://wiki.apertium.org/w/index.php?title=Wikipedia_Extractor&amp;diff=62896&amp;oldid=prev"/>
		<updated>2017-04-27T03:03:11Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 03:03, 27 April 2017&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 8:&lt;/td&gt;
  &lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 8:&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This version is much simpler than the old version. This version auto-removes any formatting and only outputs the text to one file. To use it, simply use the following command in your terminal, where dump.xml is the Wikipedia dump&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This version is much simpler than the old version. This version auto-removes any formatting and only outputs the text to one file. To use it, simply use the following command in your terminal, where dump.xml is the Wikipedia dump&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-deletedline diff-side-deleted&quot;&gt;&lt;div&gt;&lt;del class=&quot;diffchange diffchange-inline&quot;&gt;    &lt;/del&gt;$ python3 WikiExtractor.py --infn dump.xml.bz2&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-addedline diff-side-added&quot;&gt;&lt;div&gt;&lt;ins class=&quot;diffchange diffchange-inline&quot;&gt; &lt;/ins&gt;$ python3 WikiExtractor.py --infn dump.xml.bz2&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;br /&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;br /&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-deleted&quot;&gt;&lt;div&gt;This will run through all of the articles, get all of the text and put it in wiki.txt. This version also supports compression (BZip2 and Gzip), so you can use &amp;lt;code&amp;gt;dump.xml.bz2&amp;lt;/code&amp;gt; or &amp;lt;code&amp;gt;dump.xml.gz&amp;lt;/code&amp;gt; instead of &amp;lt;code&amp;gt;dump.xml.&amp;lt;/code&amp;gt; You can also compress (Bzip2) the output file by adding &amp;lt;code&amp;gt;--compress&amp;lt;/code&amp;gt; to the command.&lt;/div&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;
  &lt;td class=&quot;diff-context diff-side-added&quot;&gt;&lt;div&gt;This will run through all of the articles, get all of the text and put it in wiki.txt. This version also supports compression (BZip2 and Gzip), so you can use &amp;lt;code&amp;gt;dump.xml.bz2&amp;lt;/code&amp;gt; or &amp;lt;code&amp;gt;dump.xml.gz&amp;lt;/code&amp;gt; instead of &amp;lt;code&amp;gt;dump.xml.&amp;lt;/code&amp;gt; You can also compress (Bzip2) the output file by adding &amp;lt;code&amp;gt;--compress&amp;lt;/code&amp;gt; to the command.&lt;/div&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Firespeaker</name></author>
		
	</entry>
</feed>