<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<!-- Feed-level metadata for the "rollcall" category feed. -->
	<title>Chris Hanretty » rollcall</title>
	<!-- rel="self" link: the address of this feed document itself. -->
	<atom:link href="http://chrishanretty.co.uk/blog/index.php/category/rollcall/feed/" rel="self" type="application/rss+xml" />
	<link>http://chrishanretty.co.uk/blog</link>
	<description>Notes on Italian politics and public broadcasting</description>
	<lastBuildDate>Tue, 07 Sep 2010 14:26:20 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.8.5</generator>
	<language>en</language>
	<!-- Syndication-module refresh hints for aggregators. -->
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>

	<!-- The single post carried by this feed. -->
	<item>
		<title>Getting tables out of PDFs in Italy</title>
		<link>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/</link>
		<comments>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/#comments</comments>
		<pubDate>Tue, 15 Sep 2009 12:42:12 +0000</pubDate>
		<dc:creator>Chris</dc:creator>
		<!-- Plain one-word category labels; no characters here need escaping. -->
		<category>italy</category>
		<category>parliament</category>
		<category>rollcall</category>
		<!-- The guid is explicitly flagged isPermaLink="false" even though its value is a URL. -->
		<guid isPermaLink="false">http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/</guid>
		<!-- Plain-text excerpt; line breaks inside the CDATA section are part of the data. -->
		<description><![CDATA[The Italian Parliament annoys me tremendously. Not for substantial reasons (though it might also annoy me for that reason), but for technical reasons.
They have some nicely formatted XML files for the resoconti (minutes) of each parliamentary sitting.
But their voting information is stuck in crappy PDFs.
Grrr.
So, I have to

download all the PDF files using a horrible [...]]]></description>
		<!-- Full post body as HTML. The payload is kept flush-left because any
		     indentation inside the CDATA section would become part of the content. -->
		<content:encoded><![CDATA[<p>The Italian Parliament annoys me tremendously. Not for substantial reasons (though it might also annoy me for that reason), but for technical reasons.</p>
<p>They have some nicely formatted XML files for the resoconti (minutes) of each parliamentary sitting.</p>
<p>But their voting information is stuck in crappy PDFs.</p>
<p>Grrr.</p>
<p>So, I have to</p>
<ul>
<li>download all the PDF files using a horrible bash script;</li>
<li>convert them to XML (<code>for file in *.pdf; do pdftohtml -xml "$file"; done</code>)</li>
<li>examine the XML file to find out where the column breaks are</li>
<li>write a perl script to parse the files using this information</li>
</ul>
<p>&#8230;and then merge them.</p>
]]></content:encoded>
		<!-- Comment feed URL and the comment count for this post. -->
		<wfw:commentRss>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	</item>
</channel>
</rss>
