<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<!-- Channel metadata: this feed covers the "rollcall" category of the blog. -->
	<title>Chris Hanretty &#187; rollcall</title>
	<!-- rel="self" link: the URL of this feed document itself. -->
	<atom:link href="http://chrishanretty.co.uk/blog/index.php/category/rollcall/feed/"
	           rel="self"
	           type="application/rss+xml"/>
	<link>http://chrishanretty.co.uk/blog</link>
	<description>Notes on Italian politics and public broadcasting</description>
	<lastBuildDate>Sat, 04 Feb 2012 12:15:14 +0000</lastBuildDate>
	<generator>http://wordpress.org/?v=2.8.5</generator>
	<language>en</language>
	<!-- Syndication-module refresh hint: period "hourly" with frequency 1. -->
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>

	<!-- The only post carried by this feed. -->
	<item>
		<title>Getting tables out of PDFs in Italy</title>
		<link>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/</link>
		<comments>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/#comments</comments>
		<pubDate>Tue, 15 Sep 2009 12:42:12 +0000</pubDate>
		<dc:creator>Chris</dc:creator>

		<!-- Categories assigned to the post; plain tokens, so no CDATA wrapper is needed. -->
		<category>italy</category>
		<category>parliament</category>
		<category>rollcall</category>

		<!-- The guid carries the same URL as <link> but is flagged isPermaLink="false". -->
		<guid isPermaLink="false">http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/</guid>

		<!-- Plain-text excerpt, cut off with "[...]". Whitespace inside the CDATA is data: do not re-indent. -->
		<description><![CDATA[The Italian Parliament annoys me tremendously. Not for substantial reasons (though it might also annoy me for that reason), but for technical reasons.
They have some nicely formatted XML files for the resoconti (minutes) of each parliamentary sitting.
But their voting information is stuck in crappy PDFs.
Grrr.
So, I have to

download all the PDF files using a horrible [...]]]></description>

		<!-- Full post body as HTML, carried verbatim inside CDATA: do not re-indent. -->
		<content:encoded><![CDATA[<p>The Italian Parliament annoys me tremendously. Not for substantial reasons (though it might also annoy me for that reason), but for technical reasons.</p>
<p>They have some nicely formatted XML files for the resoconti (minutes) of each parliamentary sitting.</p>
<p>But their voting information is stuck in crappy PDFs.</p>
<p>Grrr.</p>
<p>So, I have to</p>
<ul>
<li>download all the PDF files using a horrible bash script;</li>
<li>convert them to XML (<code>for file in *.pdf; do pdftohtml -xml "$file"; done</code>)</li>
<li>examine the XML file to find out where the column breaks are</li>
<li>write a perl script to parse the files using this information</li>
</ul>
<p>&#8230;and then merge them.</p>
]]></content:encoded>

		<!-- Comment feed URL for this post, followed by the comment count given by the slash module. -->
		<wfw:commentRss>http://chrishanretty.co.uk/blog/index.php/2009/09/15/getting-tables-out-of-pdfs-in-italy/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	</item>
</channel>
</rss>

