<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>SquareCog&#039;s SquareBlog</title>
	<atom:link href="http://squarecog.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://squarecog.wordpress.com</link>
	<description></description>
	<lastBuildDate>Wed, 31 Aug 2011 01:30:43 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='squarecog.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>SquareCog&#039;s SquareBlog</title>
		<link>http://squarecog.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://squarecog.wordpress.com/osd.xml" title="SquareCog&#039;s SquareBlog" />
	<atom:link rel='hub' href='http://squarecog.wordpress.com/?pushpress=hub'/>
		<item>
		<title>Pig trick to register latest version of jar from HDFS</title>
		<link>http://squarecog.wordpress.com/2011/08/30/pig-trick-to-register-latest-version-of-jar-from-hdfs/</link>
		<comments>http://squarecog.wordpress.com/2011/08/30/pig-trick-to-register-latest-version-of-jar-from-hdfs/#comments</comments>
		<pubDate>Wed, 31 Aug 2011 01:30:40 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=150</guid>
		<description><![CDATA[%default guavaJar `hadoop fs -ls lib/*guava*jar &#124; awk '{print $8;}' &#124; sort -n &#124; head -1` register 'hdfs://$guavaJar' The same idea also works without HDFS being involved, of course.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=150&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><code><br />
%default guavaJar `hadoop fs -ls lib/*guava*jar | awk '{print $8;}' | sort -n | head -1`<br />
register 'hdfs://$guavaJar'<br />
</code></p>
<p>The same idea also works without HDFS being involved, of course.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/150/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=150&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2011/08/30/pig-trick-to-register-latest-version-of-jar-from-hdfs/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Pig Variables and Semicolons</title>
		<link>http://squarecog.wordpress.com/2011/08/24/142/</link>
		<comments>http://squarecog.wordpress.com/2011/08/24/142/#comments</comments>
		<pubDate>Wed, 24 Aug 2011 22:44:52 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop-pig]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=142</guid>
		<description><![CDATA[Pay attention to where you put your semicolons lest they become part of the value! Pig Script: %default QUOTED '20090101'; %default UNQUOTED $QUOTED; l = load '/logs/$QUOTED'; l = load '/logs/$UNQUOTED'; After preprocessing with pig -x local -r script.pig: ; l = load '/logs/20090101'; l = load '/logs/20090101;';<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=142&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Pay attention to where you put your semicolons lest they become part of the value!</p>
<p>Pig Script:</p>
<pre><code>%default QUOTED '20090101';
%default UNQUOTED $QUOTED;

l = load '/logs/$QUOTED';
l = load '/logs/$UNQUOTED';</code></pre>
<p>After preprocessing with <code>pig -x local -r script.pig</code>:</p>
<pre><code>;

l = load '/logs/20090101';
l = load '/logs/20090101;';</code></pre>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/142/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=142&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2011/08/24/142/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Hadoop requires stable hashCode() implementations</title>
		<link>http://squarecog.wordpress.com/2011/02/20/hadoop-requires-stable-hashcode-implementations/</link>
		<comments>http://squarecog.wordpress.com/2011/02/20/hadoop-requires-stable-hashcode-implementations/#comments</comments>
		<pubDate>Sun, 20 Feb 2011 11:28:38 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=132</guid>
		<description><![CDATA[It makes sense, really, once you think about it, but first you have to know to think about it. Let&#8217;s review the basics. To write a Map-Reduce job, you implement a Mapper and a Reducer. The mapper takes in pairs of keys and values, and outputs pairs of keys and values. Both keys and values [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=132&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>It makes sense, really, once you think about it, but first you have to know to think about it.</p>
<p>Let&#8217;s review the basics.</p>
<p>To write a Map-Reduce job, you implement a Mapper and a Reducer. The mapper takes in pairs of keys and values, and outputs pairs of keys and values. Both keys and values have to implement the Writable interface, which is how Hadoop deals with serializing them. The keys have to implement WritableComparable, a superset of Writable that also implements, unsurprisingly, Comparable. They have to be Comparable because Hadoop sorts keys for the Reducers.</p>
<p>The important bit here is that a partitioner is used to determine which of the reducers a key-value pair should go to. Most of the time the default HashPartitioner is used. The HashPartitioner is very simple &#8212; for every key, Hadoop invokes hashCode() and routes based on the result. If you just implement the WritableComparable interface, you will inherit Java Object&#8217;s hashCode(). Here&#8217;s a little extract from Object&#8217;s hashCode() javadoc (emphasis mine):</p>
<blockquote><p>
Whenever it is invoked on the same object more than once during an execution of a Java application, the hashCode method must consistently return the same integer, provided no information used in equals comparisons on the object is modified. <strong>This integer need not remain consistent from one execution of an application to another execution of the same application.</strong>
</p></blockquote>
<p>So, we extract some key on mapper 1, extract same key on mapper 2, both get hashed in order to determine what reducer they should go to, and your two entries with the same key go to two different reducers because Object does not guarantee that hashCode() returns the same thing for the same key in two different JVM instances.</p>
<p>Now, this is not a problem most of the time because there are a ton of WritableComparable implementations that have hashCode() that is stable across different JVMs including, of course, all the common ones &#8212; LongWritable, Text, etc. It is, however, a problem if you are wrapping complex Java objects and proxying to their hashCode() implementations. A correct hashCode() implementation <strong><i>does not</i></strong> have to return the same value in different instantiations of the application, and many, in fact, do not. Protocol Buffer messages are one example: their implementation of hashCode(), while correct, is not stable, which presents problems if you are trying to wrap them for use as Hadoop Writables. This showed up as <a href="https://github.com/kevinweil/elephant-bird/issues/#issue/28">issue 28</a> in Elephant-Bird, our collection of classes for working with lzo compression, Protocol Buffers, and Thrift objects in Hadoop. The fix is pretty simple &#8212; we just call Arrays.hashCode() on a serialized representation of the message, and make sure the serialization is cached. We&#8217;ll have to serialize it anyway when Hadoop wants to write it out to disk, so there&#8217;s no real overhead.</p>
<pre class="brush: java;">
  /**
   * &lt;p&gt;Returns a hashCode that is stable across multiple instances of JVMs.
   * (&lt;code&gt;hashCode()&lt;/code&gt; is not required to return the same value in
   * different instances of the same applications in Java, just in a
   * single instance of the application; Hadoop imposes a more strict requirement.)
   */
  @Override
  public int hashCode() {
    byte[] bytes = serialize();
    return (bytes == null) ? 31 : Arrays.hashCode(bytes);
  }
</pre>
<p>Now, the fun part. We fixed the bug; now we have to write a test. Our test requires two JVMs &#8212; the bug, by its very nature, does not manifest itself in a single JVM. A few terrible ideas came to mind, like shelling out to a java main through Runtime.exec, but eventually it occurred to me that Apache Ant already does this and there must be something I can use. It&#8217;s a bit gnarly, but here&#8217;s the test &#8212; we use Ant&#8217;s Java class, set it up with the proper environment, and exec a main() in it.</p>
<pre class="brush: java;">
  @Test
  public void testStableHashcodeAcrossJVMs() throws IOException {
    int expectedHashCode = referenceAbWritable.hashCode();
    Java otherJvm = new Java();
    otherJvm.setNewenvironment(true);
    otherJvm.setFork(true);
    otherJvm.setClassname(OtherJvmClass.class.getName());
    for (Map.Entry&lt;String, String&gt; entry : System.getenv().entrySet()) {
      Environment.Variable var = new Environment.Variable();
      var.setKey(entry.getKey());
      var.setValue(entry.getValue());
      otherJvm.addEnv(var);
    }
    for (String prop : System.getProperties().stringPropertyNames()) {
      String propValue = System.getProperty(prop);
      Environment.Variable var = new Environment.Variable();
      var.setKey(prop);
      var.setValue(propValue);
      otherJvm.addSysproperty(var);
    }
    otherJvm.setDir(new File(System.getProperty(&quot;java.io.tmpdir&quot;)));
    File tmpOut = File.createTempFile(&quot;otherJvm&quot;, &quot;txt&quot;);
    otherJvm.setArgs(tmpOut.getAbsolutePath());
    otherJvm.init();
    otherJvm.executeJava();
    DataInputStream is = new DataInputStream(new FileInputStream(tmpOut));
    assertEquals(expectedHashCode, is.readInt());
    is.close();
  }

  public static class OtherJvmClass {
    /* Used for testStableHashcodeAcrossJVMs */
    public static void main(String[] args) throws IOException {
      setUp();
      int hashCode = referenceAbWritable.hashCode();
      File tmpFile = new File(args[0]);
      DataOutputStream os = new DataOutputStream(new FileOutputStream(tmpFile));
      os.writeInt(hashCode);
      os.close();
      System.exit(0);
    }
  }
</pre>
<p>There is probably a better way to communicate between the processes than through a temp file, but nothing terribly obvious showed up when I browsed the javadocs, and this did get the job done. The test reliably fails when applied to the original implementation of ProtobufWritable, and passes in the new version.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/132/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/132/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/132/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=132&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2011/02/20/hadoop-requires-stable-hashcode-implementations/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Incrementing Hadoop Counters in Apache Pig</title>
		<link>http://squarecog.wordpress.com/2010/12/24/incrementing-hadoop-counters-in-apache-pig/</link>
		<comments>http://squarecog.wordpress.com/2010/12/24/incrementing-hadoop-counters-in-apache-pig/#comments</comments>
		<pubDate>Sat, 25 Dec 2010 00:12:19 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[pig]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=124</guid>
		<description><![CDATA[Information about incrementing Hadoop counters from inside Pig UDFs is not currently well-documented, judging by the user list traffic, so this is a brief note showing how to do that. Hadoop counters are a way to report basic statistics of a job in Hadoop. I won&#8217;t go into a detailed discussion what they are and [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=124&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Information about incrementing Hadoop counters from inside Pig UDFs is not currently well-documented, judging by the user list traffic, so this is a brief note showing how  to do that.</p>
<p>Hadoop counters are a way to report basic statistics of a job in Hadoop. I won&#8217;t go into a detailed discussion of what they are and when to use them here &#8212; there&#8217;s plenty of information about that on the internet (for starters, see the <a href="http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/content/counters.html">Cloud9 intro to Counters</a>, and some guidelines for appropriate usage in <a href="http://developer.yahoo.com/blogs/hadoop/posts/2010/08/apache_hadoop_best_practices_a/">&#8220;Apache Hadoop Best Practices and Anti-Patterns&#8221;</a>).</p>
<p><strong>Pig 0.6 and before</strong></p>
<p>Counters were not explicitly supported in Pig 0.6 and before, but you could get at them with this hack (inside a UDF):</p>
<pre class="brush: java;">
Reporter reporter = PigHadoopLogger.getInstance().getReporter();
if (reporter != null) {
  reporter.incrCounter(myEnum, 1L);
}
</pre>
<p><strong>Pig 0.8</strong><br />
Pig 0.8 has an &#8220;official&#8221; method for getting and incrementing counters from a UDF:</p>
<pre class="brush: java;">
PigStatusReporter reporter = PigStatusReporter.getInstance();
if (reporter != null) {
   reporter.getCounter(key).increment(incr);
}
</pre>
<p>You can also get Counters programmatically if you are invoking Pig using PigRunner, and getting a PigStats object on completion. It&#8217;s a bit involved:</p>
<pre class="brush: java;">
PigStats.JobGraph jobGraph = pigStats.getJobGraph();
for (JobStats jobStats :  jobGraph) {
  Counters counters = jobStats.getHadoopCounters();
}
</pre>
<p><strong>Pig 0.7</strong><br />
Unfortunately I don&#8217;t know of a way to do this in 0.7, as the old hack went away and the new PigStatusReporter hadn&#8217;t been added yet. If you have a trick, please comment.</p>
<p><strong>Watch out for nulls</strong><br />
We&#8217;ve observed that sometimes the reporter is null for a bit even when a UDF is executing on the MR side. To deal with this, we added a little helper class <code>PigCounterHelper</code> to <a href="https://github.com/kevinweil/elephant-bird">Elephant-Bird</a> that buffers the writes in a Map, and flushes them when it gets a non-null counter. </p>
<p>So there. If someone asks about counters in Pig, send them here.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/124/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=124&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2010/12/24/incrementing-hadoop-counters-in-apache-pig/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>New Features in Apache Pig 0.8</title>
		<link>http://squarecog.wordpress.com/2010/12/19/new-features-in-apache-pig-0-8/</link>
		<comments>http://squarecog.wordpress.com/2010/12/19/new-features-in-apache-pig-0-8/#comments</comments>
		<pubDate>Sun, 19 Dec 2010 23:03:42 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[pig]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=104</guid>
		<description><![CDATA[The Pig 0.8 release includes a large number of bug fixes and optimizations, but at the core it is a feature release. It&#8217;s been in the works for almost a full year (most of the work on 0.7 was completed by January of 2009, although it took a while to actually get the release out), [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=104&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>The Pig 0.8 release includes a large number of bug fixes and optimizations, but at the core it is a feature release. It&#8217;s been in the works for almost a full year (most of the work on 0.7 was completed by January of 2009, although it took a while to actually get the release out), and the amount of time spent on 0.8 really shows.</p>
<p>I <a href="https://squarecog.wordpress.com/2010/08/20/upcoming-features-in-pig-0-8-dynamic-invokers/">meant</a> to describe these in detail in a series of posts, but it seems blogging regularly is not my forte. This release is so chock-full of great new features, however, that I feel compelled to at least list them. So, behold, in no particular order, a non-exhaustive list of new features I am excited about in Pig 0.8:</p>
<li><strong>Support for UDFs in scripting languages</strong></li>
<p>This is exactly what it sounds like &#8212; if your favorite language has a JVM implementation, it can be used to create Pig UDFs.</p>
<p>Pig now ships with support for UDFs in Jython, but other languages can be supported by implementing a few interfaces. Details about the Pig UDFs in Python can be found here: <a href="http://pig.apache.org/docs/r0.8.0/udf.html#Python+UDFs">http://pig.apache.org/docs/r0.8.0/udf.html#Python+UDFs</a></p>
<p>This is the outcome of <a href="http://issues.apache.org/jira/browse/PIG-928">PIG-928</a>; it was quite a pleasure to watch this develop over time &#8212; while most Pig tickets wind up getting worked on by at most one or two people, this turned into a collaboration of quite a few developers, many of them new to the project &#8212; Kishore Gopalakrishna&#8217;s patch was the initial conversation starter, which was then hacked on or merged into similar work by Woody Anderson, Arnab Nandi, Julien Le Dem, Ashutosh Chauhan and Aniket Mokashi (Aniket deserves an extra shout-out for patiently working to incorporate everyone&#8217;s feedback and pushing the patch through the last mile).</p>
<li><strong>PigUnit</strong></li>
<p>A contribution by Romain Rigaux, PigUnit is exactly what it sounds like &#8212; a tool that simplifies the Pig users&#8217; lives by giving them a simple way to unit test Pig scripts.</p>
<p>The documentation at <a href="http://pig.apache.org/docs/r0.8.0/pigunit.html">http://pig.apache.org/docs/r0.8.0/pigunit.html</a> and the code at <a href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java?view=markup">http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java?view=markup</a> speak for themselves as far as usage.</p>
<li><strong>PigStats</strong></li>
<p>Pig can now provide much better visibility into what is going on inside a Pig job than it ever did before, thanks to extensive work by Richard Ding (see <a href="http://issues.apache.org/jira/browse/PIG-1333">PIG-1333</a> and <a href="http://issues.apache.org/jira/browse/PIG-1478">PIG-1478</a>). This feature comes in three parts:</p>
<p>1. Script statistics.<br />
This is the most easily visible change. At the end of running a script, Pig will output a table with some basic statistics regarding the jobs that it ran. It looks something like this:</p>
<p>Job Stats (time in seconds):</p>
<table>
<tr>
<td>JobId</td>
<td>Maps</td>
<td>Reduces</td>
<td>Max<br />Map<br />Time</td>
<td>Min<br />Map<br />Time</td>
<td>Avg<br />Map<br />Time</td>
<td>Max<br />Reduce<br />Time</td>
<td>Min<br />Reduce<br />Time</td>
<td>Avg<br />Reduce<br />Time</td>
<td>Alias</td>
<td>Feature</td>
<td>Outputs</td>
</tr>
<tr>
<td>job_xxx</td>
<td>1654</td>
<td>218</td>
<td>84	</td>
<td>6</td>
<td>14</td>
<td>107</td>
<td>87</td>
<td>99</td>
<td>counted_data,<br />data,<br />grouped_data</td>
<td>	GROUP_BY,<br />COMBINER</td>
<td></td>
</tr>
<tr>
<td>job_xxx</td>
<td>2</td>
<td>1</td>
<td>9</td>
<td>6</td>
<td>7</td>
<td>13</td>
<td>13</td>
<td>13</td>
<td>ordered_data</td>
<td>SAMPLER</td>
<td></td>
</tr>
<tr>
<td>job_xxx</td>
<td>2</td>
<td>1</td>
<td>26</td>
<td>18</td>
<td>22</td>
<td>31</td>
<td>31</td>
<td>31</td>
<td>ordered_data</td>
<td>ORDER_BY</td>
<td>hdfs://tmp/out,</td>
</tr>
</table>
<p>This is extremely useful when debugging slow jobs, as you can immediately identify which stages of your script are slow, and correlate the slow Map-Reduce jobs with the actual Pig operators and relations in your script &#8212; something that was not trivial before (folks often resorted to setting parallelism to slightly different numbers for every join and group just to figure out which job was doing what. No more of this!)</p>
<p>2. Data in Job XML</p>
<p>Pig now inserts several interesting properties into the Hadoop jobs that it generates, including the relations being generated, Pig features being used, and ids of parent Hadoop jobs. This is quite helpful when monitoring a cluster, and is also handy when examining job history using the HadoopJobHistoryLoader, now part of piggybank (use Pig to mine your job history!).</p>
<p>3. PigRunner API</p>
<p>The same information that is printed out when Pig runs the script from a command line is available if one uses the Java API to start Pig jobs. If you start a script using <code>PigRunner.run(String args[], PigProgressNotificationListener listener)</code>, you will get as a result a <a href="http://pig.apache.org/docs/r0.8.0/api/org/apache/pig/tools/pigstats/PigStats.html">PigStats</a> object that gives you access to the job hierarchy, the Hadoop counters from each job, and so on. You can implement the optional <a href="http://pig.apache.org/docs/r0.8.0/api/org/apache/pig/tools/pigstats/PigProgressNotificationListener.html">PigProgressNotificationListener</a> if you want to watch the job as it progresses; the listener will be notified as different component jobs start and finish.</p>
<p>Documentation of the API, new properties in the Job XML, and more, is available at <a href="http://pig.apache.org/docs/r0.8.0/piglatin_ref1.html#Pig+Statistics">http://pig.apache.org/docs/r0.8.0/piglatin_ref1.html#Pig+Statistics</a></p>
<li><strong>Scalar values</strong></li>
<p>It&#8217;s very common to need to use some calculated statistic in a calculation to inform other calculations. For example, consider a data set that consists of people and their eye color; we want to calculate the fraction of the total population that has a given eye color. The script looks something like this:</p>
<pre><code>people = LOAD '/data/people' using PigStorage()
  AS (person_id:long, eye_color:chararray);
num_people = FOREACH (group people all)
  GENERATE COUNT(people) AS total;
eye_color_fractions = FOREACH ( GROUP people BY eye_color )
  GENERATE
    group as eye_color,
    COUNT(people) / num_people.total AS fraction;</code></pre>
<p>Pretty straightforward, except it does not work. What&#8217;s happening in the above code is that we are referencing the relation <code>num_people</code> from inside another relation, <code>eye_color_fractions</code>, and this doesn&#8217;t really make sense if Pig does not know that <code>num_people</code> only has one row.</p>
<p>In the past you had to do something hacky like joining the two relations on a constant to replicate the total into each row, and then generate the division. Needless to say, this was not entirely satisfactory. In <a href="http://issues.apache.org/jira/browse/PIG-1434">PIG-1434</a> Aniket Mokashi tackled this, implementing an elegant solution that hides all of these details from the user &#8212; you can now simply cast a single-row relation as a scalar, and use it as desired. The above script becomes:</p>
<pre><code>people = LOAD '/data/people' using PigStorage()
  AS (person_id:long, eye_color:chararray);
num_people = FOREACH (group people all)
  GENERATE COUNT(people) AS total;
eye_color_fractions = FOREACH ( GROUP people BY eye_color )
  GENERATE
    group as eye_color,
    COUNT(people) / <strong>(long)</strong> num_people.total AS fraction;</code></pre>
<p>This makes the casting explicit, but Pig is now smart enough to do this implicitly as well. A runtime exception is generated if the relation being used as a scalar contains more than one tuple.</p>
<p>More documentation of this feature is available at <a href="http://pig.apache.org/docs/r0.8.0/piglatin_ref2.html#Casting+Relations+to+Scalars">http://pig.apache.org/docs/r0.8.0/piglatin_ref2.html#Casting+Relations+to+Scalars</a></p>
<li><strong>Monitored UDFs</strong></li>
<p>A new annotation has been added, <code>@MonitoredUDF</code>, which makes Pig spawn a watcher thread that kills an execution that is taking too long, and return a default value instead. This comes in handy when dealing with certain operations like complex regular expressions. More documentation is available at <a href="http://pig.apache.org/docs/r0.8.0/udf.html#Monitoring+long-running+UDFs">http://pig.apache.org/docs/r0.8.0/udf.html#Monitoring+long-running+UDFs</a></p>
<li><strong>Automatic merge of small files</strong></li>
<p>This is a simple one, but useful &#8212; when running Pig over many small files, instead of creating a map task per file (paying the overhead of scheduling and running a task for a computation that might only take a few seconds), we can merge the inputs and create a few map tasks that are a bit more hefty.</p>
<p>Two properties control this behavior: <code>pig.maxCombinedSplitSize</code> controls the maximum size of the resulting split, and <code>pig.splitCombination</code> controls whether or not the feature is activated in the first place (it is on by default).</p>
<p>This work is documented in the ticket <a href="http://issues.apache.org/jira/browse/PIG-1518">PIG-1518</a>; there are additional details in the release notes attached to the ticket.</p>
<li><strong>Generic UDFs</strong></li>
<p>I <a href="https://squarecog.wordpress.com/2010/08/20/upcoming-features-in-pig-0-8-dynamic-invokers/">wrote about this one</a> before &#8212; a small feature that allows you to invoke static Java methods as Pig UDFs without needing to wrap them in custom code. </p>
<p>The official documentation is available at <a href="http://pig.apache.org/docs/r0.8.0/piglatin_ref1.html#Dynamic+Invokers">http://pig.apache.org/docs/r0.8.0/piglatin_ref1.html#Dynamic+Invokers</a></p>
<li><strong>Safeguards against missing PARALLEL keyword</strong></li>
<p>One of the more common mistakes people make when writing Pig scripts is forgetting to specify parallelism for operators that need it. The default behavior used to be that this means parallelism of 1, which can lead to extremely inefficient jobs. A patch by Jeff Zhang in <a href="http://issues.apache.org/jira/browse/PIG-1249">PIG-1249</a> changes this behavior to instead use a simple heuristic: if parallelism is not specified, derive the number of reducers by taking <code>MIN(max_reducers, total_input_size / bytes_per_reducer)</code>. Max number of reducers is controlled by the property <code>pig.exec.reducers.max</code> (default 999) and bytes per reducer are controlled by <code>pig.exec.reducers.bytes.per.reducer</code> (default 1GB).</p>
<p>This is a safeguard, not a panacea; it only works with file-based input, and it estimates the number of reducers based on input size, not the size of the intermediate data &#8212; so if you have a highly selective filter, or you are grouping a large dataset by a low-cardinality field, it will produce a bad number &#8212; but it&#8217;s a nice safeguard against dramatic misconfigurations. </p>
<blockquote><p>When porting to Apache Pig 0.8, remember to audit your scripts for parallelized operators that do not specify the <code>PARALLEL</code> keyword &#8212; if the intent is to use a single reducer, make this intent explicit by specifying <code>PARALLEL 1</code>.</p></blockquote>
<li><strong>HBaseStorage</strong></li>
<p>HBaseStorage has been shored up in Pig 0.8. It can now read data stored as bytes instead of requiring all numbers to be converted to Strings; it accepts a number of options &#8212; limit the number of rows returned, push down filters on HBase keys, etc. It can also now be used to write to HBase in addition to reading from it. Details about the options, etc, can be found in the Release Notes section of <a href="http://issues.apache.org/jira/browse/PIG-1205">PIG-1205</a>.</p>
<p>Note that at the moment this only works with the HBase 0.20.{4,5,6} releases, and does not work with 0.89+. There is a patch in <a href="http://issues.apache.org/jira/browse/PIG-1680">PIG-1680</a> that you can apply if you need 0.89 and 0.90 compatibility; it is not applied to the main codebase yet, as it is not backwards compatible.</p>
<p>We are very interested in help making this Storage engine more featureful, please feel free to jump in and contribute!</p>
<li><strong>Support for custom Map-Reduce jobs in the flow</strong></li>
<p>Although we try to make these a rarity, sometimes cases come up in which a custom Map-Reduce job fits the bill better than Pig. Weaving a Map-Reduce job into the middle of a Pig workflow was awkward before &#8212; you had to use something like Oozie or Azkaban, or write your own workflow application. Pig 0.8 introduces a simple &#8220;MAPREDUCE&#8221; operator which allows you to invoke an opaque MR job in the middle of the flow, and continue with Pig:<br />
<code>
<pre>
text = load 'WordcountInput.txt';
wordcount = MAPREDUCE wordcount.jar
  STORE text INTO 'inputDir'
  LOAD 'outputDir' AS (word:chararray, count: int)
  `org.myorg.WordCount inputDir outputDir`;
</pre>
<p></code></p>
<p>Details are available on the wiki page: <a href="http://wiki.apache.org/pig/NativeMapReduce">http://wiki.apache.org/pig/NativeMapReduce</a></p>
<p>The ticket for this one has been open for a while, since Pig 0.2 days, and it&#8217;s nice to see it finally implemented. Thumbs up to Aniket Mokashi for this one.</p>
<li><strong>Custom Partitioners</strong></li>
<p>This feature, also implemented by the amazingly productive Aniket Mokashi, is also a bit of a power-user thing (and also an ancient ticket, PIG-282). It allows the Pig script author to control the function used to distribute map output among reducers. By default, Pig uses a random hash partitioner, but sometimes a custom algorithm is required when the script author knows something particularly unique about the reduce key distribution. When that is the case, a user can now specify the Hadoop Partitioner to swap in instead of the default:</p>
<p><code>B = group A by $0 PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner parallel 2; </code></p>
<p>More specific documentation can be found in the Release Notes section of <a href="http://issues.apache.org/jira/browse/PIG-282">PIG-282</a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/104/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/104/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/104/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=104&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2010/12/19/new-features-in-apache-pig-0-8/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Upcoming Features in Pig 0.8: Dynamic Invokers</title>
		<link>http://squarecog.wordpress.com/2010/08/20/upcoming-features-in-pig-0-8-dynamic-invokers/</link>
		<comments>http://squarecog.wordpress.com/2010/08/20/upcoming-features-in-pig-0-8-dynamic-invokers/#comments</comments>
		<pubDate>Fri, 20 Aug 2010 08:28:31 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[pig]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=91</guid>
		<description><![CDATA[Pig release 0.8 is scheduled to be feature-frozen and branched at the end of August 2010. This release has many, many useful new features, mostly addressing usability. In this series of posts, I will demonstrate some of my favorites from this release. Pig 0.8 will have a family of built-in UDFs called Dynamic Invokers. The [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=91&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<blockquote><p>
Pig release 0.8 is scheduled to be feature-frozen and branched at the end of August 2010. This release has many, many useful new features, mostly addressing usability. In this series of posts, I will demonstrate some of my favorites from this release.
</p></blockquote>
<p>Pig 0.8 will have a family of built-in UDFs called Dynamic Invokers. The idea is simple: frequently, Pig users need to use a simple function that is already provided by standard Java libraries, but for which a UDF has not been written. Dynamic Invokers allow a Pig programmer to refer to Java functions without having to wrap them in custom Pig UDFs, at the cost of doing some Java reflection on every function call.<br />
<span id="more-91"></span></p>
<h3>An example.</h3>
<p>Let&#8217;s start off with a quick motivating example. Imagine we have a bunch of URL-encoded strings which we want to decode. In Java, this is done by simply calling:</p>
<pre class="brush: java;">String decoded = URLDecoder.decode(encoded, &quot;UTF-8&quot;);</pre>
<p>In Pig, there is no built-in function to do this, but it&#8217;s easy enough to write your own, wrapping the URLDecoder function:</p>
<pre class="brush: java;">
package org.squarecog.pig;

import java.io.IOException;
import java.net.URLDecoder;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;

public class UrlDecode extends EvalFunc&lt;String&gt; {

    @Override
    public String exec(Tuple input) throws IOException {
        String encoded = (String) input.get(0);
        String encoding = (String) input.get(1);
        return URLDecoder.decode(encoded, encoding);
    }
}
</pre>
<p>This is about the least amount of code you can get away with &#8212; it doesn&#8217;t check for failing casts, non-existing fields, and all kinds of other problems, but it does the job most of the time. Having written this class, the next step would be to compile it, test it, package it into a jar, and now the decoder is ready to be used in Pig:</p>
<pre class="brush: sql;">
REGISTER squarecogs_pig_stuff.jar;

encoded_strings = LOAD 'encoded_strings.txt' as (encoded:chararray);
decoded_strings = FOREACH encoded_strings GENERATE org.squarecog.pig.UrlDecode(encoded, 'UTF-8');
</pre>
<p><strong>What a pain.</strong> There must be an easier way, right? Well, now there is. With Pig 0.8 all you have to do is put this in your Pig script:</p>
<pre class="brush: sql;">
DEFINE UrlDecode InvokeForString('java.net.URLDecoder.decode', 'String String');
encoded_strings = LOAD 'encoded_strings.txt' as (encoded:chararray);
decoded_strings = FOREACH encoded_strings GENERATE UrlDecode(encoded, 'UTF-8');
</pre>
<p><strong>That&#8217;s it. No Java, no compilation.</strong> Just use it.</p>
<h3>Usage</h3>
<p>Currently, Dynamic Invokers can be used for any static function that accepts no arguments or some combination of Strings, ints, longs, doubles, floats, or arrays of same, and returns a String, an int, a long, a double, or a float. Primitives only for the numbers, no capital-letter numeric classes as arguments. Depending on the return type, a specific kind of Invoker must be used: InvokeForString, InvokeForInt, InvokeForLong, InvokeForDouble, or InvokeForFloat.</p>
<p>The DEFINE keyword is used to bind a keyword to a Java method, as above. The first argument to the InvokeFor* constructor is the full path to the desired method. The second argument is a space-delimited ordered list of the classes of the method arguments. This can be omitted or an empty string if the method takes no arguments. Valid class names are String, Long, Float, Double, and Int. Invokers can also work with array arguments, represented in Pig as DataBags of single-tuple elements. Simply refer to <code>string[]</code>, for example. Class names are not case-sensitive.</p>
<h3>Speed</h3>
<p>I tested the speed of these Invokers by using them to take log of the numbers from 0 to 1,000,000 in a tight loop. For this experiment, using the dynamic InvokeForDouble UDF was about twice as slow as using the Log UDF directly. I find this to be an acceptable cost to pay for the speed and convenience of development when writing prototypes and one-off exploratory scripts. Naturally, if you are trying to squeeze all the performance that&#8217;s possible out of your scripts, you should use regular UDFs.</p>
<h3>Arrays</h3>
<p>As mentioned, Pig 0.8 invokers will support array arguments. This makes methods like those in <a href="http://commons.apache.org/math/apidocs/org/apache/commons/math/stat/StatUtils.html">org.apache.commons.math.stat.StatUtils</a> available for processing the results of grouping your datasets, for example. This is very nice, but a word of caution: the resulting UDF will of course not be optimized for Hadoop, and the very significant benefits one gains from implementing the Algebraic and Accumulative interfaces are lost here. Be careful with this one.</p>
<h3>Future Work</h3>
<p>If people find these Invokers useful, more features can be added, such as support for booleans, bytes, and the various Number classes (rather than just primitives). Let me know what you would like to see, either in the comments, or, even better, on the Pig user mailing list.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/91/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/91/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/91/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=91&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2010/08/20/upcoming-features-in-pig-0-8-dynamic-invokers/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Pig, HBase, Hadoop, and Twitter: HUG talk slides</title>
		<link>http://squarecog.wordpress.com/2010/05/20/pig-hbase-hadoop-and-twitter-hug-talk-slides/</link>
		<comments>http://squarecog.wordpress.com/2010/05/20/pig-hbase-hadoop-and-twitter-hug-talk-slides/#comments</comments>
		<pubDate>Thu, 20 May 2010 07:01:23 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[pig]]></category>
		<category><![CDATA[presentations]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=84</guid>
		<description><![CDATA[I presented tonight at the Bay Area Hadoop User Group, talking briefly about Twitter&#8217;s use of Hadoop and Pig. Here are the slides:<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=84&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I presented tonight at the Bay Area Hadoop User Group, talking briefly about Twitter&#8217;s use of Hadoop and Pig. Here are the slides:<br />
<iframe class="scribd_iframe_embed" src="http://www.scribd.com/embeds/31652181/content?start_page=1&amp;view_mode=slideshow&amp;access_key=key-1p43i2xdlkk4ps79sg10" data-auto-height="true" scrolling="no" id="scribd_31652181" width="100%" height="500" frameborder="0"></iframe>
<div style="font-size:10px;text-align:center;width:100%"><a href="http://www.scribd.com/doc/31652181">View this document on Scribd</a></div></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/84/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/84/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/84/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=84&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2010/05/20/pig-hbase-hadoop-and-twitter-hug-talk-slides/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>GROUP operator in Apache Pig</title>
		<link>http://squarecog.wordpress.com/2010/05/11/group-operator-in-apache-pig/</link>
		<comments>http://squarecog.wordpress.com/2010/05/11/group-operator-in-apache-pig/#comments</comments>
		<pubDate>Tue, 11 May 2010 15:36:41 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[pig]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=68</guid>
		<description><![CDATA[I&#8217;ve been doing a fair amount of helping people get started with Apache Pig. One common stumbling block is the GROUP operator. Although familiar, as it serves a similar function to SQL&#8217;s GROUP operator, it is just different enough in the Pig Latin language to be confusing. Hopefully this brief post will shed some light [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=68&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I&#8217;ve been doing a fair amount of helping people get started with Apache Pig. One common stumbling block is the GROUP operator. Although familiar, as it serves a similar function to SQL&#8217;s GROUP operator, it is just different enough in the Pig Latin language to be confusing. Hopefully this brief post will shed some light on what exactly is going on.<br />
<span id="more-68"></span></p>
<h4>Basic Usage</h4>
<p>Let us start by loading up some data:</p>
<pre class="brush: sql;">
my_data = LOAD '/data/customers' using PigStorage()
  as (name:chararray, age:int, eye_color:chararray, height:int);
</pre>
<p>If we want to compute some aggregates from this data, we might want to group the rows into buckets over which we will run the aggregate functions:</p>
<pre class="brush: sql;">
by_age = GROUP my_data BY age;
by_age_and_color = GROUP my_data BY (age, eye_color);
</pre>
<h4>Resulting Schema</h4>
<p>When you group a relation, the result is a new relation with two columns: &#8220;group&#8221; and the name of the original relation. The group column has the schema of what you grouped by. If you grouped by an integer column, for example, as in the first example, the type will be int. If you grouped by a tuple of several columns, as in the second example, the &#8220;group&#8221; column will be a tuple with two fields, &#8220;age&#8221; and &#8220;eye_color&#8221;.</p>
<p>They can be retrieved by flattening &#8220;group&#8221;, or by directly accessing them: &#8220;group.age, group.eye_color&#8221;:</p>
<pre class="brush: sql;">
-- using FLATTEN
age_and_color = FOREACH by_age_and_color GENERATE FLATTEN(group) as (age, color);

-- or using explicit projections
age_and_color = FOREACH by_age_and_color GENERATE group.age, group.eye_color;
</pre>
<p>Note that using the FLATTEN operator is preferable since it allows algebraic optimizations to work &#8212; but that&#8217;s a subject for another post.</p>
<p>The second column will be named after the original relation, and contain a <i>bag</i> of all the rows in the original relation that match the corresponding group. The rows are unaltered &#8212; they are the same as they were in the original table that you grouped. </p>
<p>As a side note, Pig also provides a handy operator called COGROUP, which essentially performs a join and a group at the same time. The syntax is as follows:</p>
<pre class="brush: sql;">
cogrouped_data = COGROUP data1 BY id, data2 BY user_id;
</pre>
<p>The resulting schema will be the group as described above, followed by two columns &#8212; data1 and data2, each containing bags of tuples with the given group key. This is very useful if you intend to join and group on the same key, as it saves you a whole Map-Reduce stage.</p>
<h4>Processing the results</h4>
<p>To work on the results of the group operator, you will want to use a FOREACH. This is a simple loop construct that works on a relation one row at a time. You can apply it to any relation, but it&#8217;s most frequently used on results of grouping, as it allows you to apply aggregation functions to the collected bags.</p>
<p>Referring to somebag.some_field in a FOREACH operator essentially means &#8220;for each tuple in the bag, give me some_field in that tuple&#8221;.  So you can do things like</p>
<pre class="brush: sql;">
age_counts = FOREACH by_age GENERATE
  group as age,  -- the key you grouped on
 COUNT(my_data), -- the number of people with this age
 MAX(my_data.height); -- the maximum height of people with this age
</pre>
<p>Note that all the functions in this example are aggregates. That&#8217;s because they are things we can do to a <i>collection of values</i>. Folks sometimes try to apply single-item operations in a foreach &#8212; like transforming strings or checking for specific values of a field. Remember, my_data.height doesn&#8217;t give you a single height element &#8212; it gives you all the heights of all people in a given age group.</p>
<h4>Multiple dimensions</h4>
<p>It is common to need counts by multiple dimensions; in our running example, we might want to get not just the maximum or the average height of all people in a given age category, but also the number of people in each age category with a certain eye color. There are a few ways to achieve this, depending on how you want to lay out the results.</p>
<p>The simplest is to just group by both age and eye color:</p>
<pre class="brush: sql;">
by_age_color = GROUP my_data BY (age, eye_color);

-- count colors separately
by_age_color_counts = FOREACH by_age_color GENERATE
    FLATTEN(group) AS (age, eye_color),
    AVG(my_data.height) as age_color_height_avg,
    COUNT(my_data) AS age_color_count;
</pre>
<p>From there, you can group by_age_color_counts again and get your by-age statistics.</p>
<p>If you have a set list of eye colors, and you want the eye color counts to be columns in the resulting table, you can do the following:</p>
<pre class="brush: sql;">
-- break out the counts
my_data = FOREACH my_data
               GENERATE name, age, height,
               (eye_color == 'brown' ? 1 : 0) AS brown_eyes,
               (eye_color == 'blue'  ? 1 : 0) AS blue_eyes,
               (eye_color == 'green' ? 1 : 0 ) AS green_eyes;

-- group and generate
by_age = group my_data by age;
final_data = FOREACH by_age GENERATE
    group as age,
    COUNT(my_data) as num_people,
    AVG(my_data.height) as avg_height,
    SUM(my_data.brown_eyes) as num_brown_eyes,
    SUM(my_data.blue_eyes) as num_blue_eyes,
    SUM(my_data.green_eyes) as num_green_eyes;
</pre>
<h4>Advanced Topics</h4>
<p>A few notes on more advanced topics, which perhaps should warrant a more extensive treatment in a separate post.</p>
<p>The GROUP operator in Pig is a &#8216;blocking&#8217; operator, and forces a Hadoop Map-Reduce job. All the data is shuffled, so that rows in different partitions (or &#8220;slices&#8221;, if you prefer the pre-Pig 0.7 terminology) that have the same grouping key wind up together. Therefore, grouping has non-trivial overhead, unlike operations like filtering or projecting. Consider this when putting together your pipelines.</p>
<p>If you need to calculate statistics on multiple different groupings of the data, it behooves one to take advantage of Pig&#8217;s multi-store optimization, wherein it will find opportunities to share work between multiple calculations.</p>
<p>When groups grow too large, they can cause significant memory issues on reducers; they can lead to hot spots, and all kinds of other badness. Look up algebraic and accumulative EvalFunc interfaces in the Pig documentation, and try to use them to avoid this problem when possible. Check the execution plan (using the &#8216;explain&#8217; command) to make sure the algebraic and accumulative optimizations are used.</p>
<p>Pig 0.7 introduces an option to group on the map side, which you can invoke when you know that all of your keys are guaranteed to be on the same partition. Consider it when this condition applies.</p>
<h4>Cleanup()</h4>
<p>So there you have it, a somewhat ill-structured brain dump about the GROUP operator in Pig. I hope it helps folks &#8212; if something is confusing, please let me know in the comments!</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/68/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/68/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/68/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=68&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2010/05/11/group-operator-in-apache-pig/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
		<item>
		<title>Twitter Lists as Tags</title>
		<link>http://squarecog.wordpress.com/2009/11/11/twitter-lists-as-tags/</link>
		<comments>http://squarecog.wordpress.com/2009/11/11/twitter-lists-as-tags/#comments</comments>
		<pubDate>Wed, 11 Nov 2009 19:03:56 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=63</guid>
		<description><![CDATA[I created a toy script that is based on the idea that when people list twitter users in lists, they are implicitly tagging those users. Using the Twitter API, it&#8217;s dead simple to extract the lists a user belongs to, do a tiny bit of parsing, and visualize the tags in a word cloud. So [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=63&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I created a toy script that is based on the idea that when people list twitter users in lists, they are implicitly tagging those users. Using the Twitter API, it&#8217;s dead simple to extract the lists a user belongs to, do a tiny bit of parsing, and visualize the tags in a word cloud. So I did it. Let me know what you think: <a href="http://www.squarecog.com/twittags">TwitTags</a>.<br />
<a href="http://www.squarecog.com/twittags"><img src="http://squarecog.files.wordpress.com/2009/11/squarecog-wordle.png?w=500&#038;h=290" alt="Wordle for Squarecog tags" title="squarecog-wordle" width="500" height="290" class="aligncenter size-full wp-image-64" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/63/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/63/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/63/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=63&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2009/11/11/twitter-lists-as-tags/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>

		<media:content url="http://squarecog.files.wordpress.com/2009/11/squarecog-wordle.png" medium="image">
			<media:title type="html">squarecog-wordle</media:title>
		</media:content>
	</item>
		<item>
		<title>Presentation on Apache Pig at Pittsburgh Hadoop User Group</title>
		<link>http://squarecog.wordpress.com/2009/11/03/apache-pig-apittsburgh-hadoop-user-group/</link>
		<comments>http://squarecog.wordpress.com/2009/11/03/apache-pig-apittsburgh-hadoop-user-group/#comments</comments>
		<pubDate>Wed, 04 Nov 2009 03:04:42 +0000</pubDate>
		<dc:creator>squarecog</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[hadoop]]></category>
		<category><![CDATA[hadoop-pig]]></category>
		<category><![CDATA[presentations]]></category>

		<guid isPermaLink="false">http://squarecog.wordpress.com/?p=55</guid>
		<description><![CDATA[Ashutosh and I presented at the Pittsburgh Hadoop User Group on Apache Pig. The slide deck goes through a brief intro to Pig Latin, then jumps into an explanation of the different join algorithms, and finishes up with some research ideas. A pretty wide-ranging talk, for a diverse audience. Scribd messed up some of the [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=55&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Ashutosh and I presented at the Pittsburgh Hadoop User Group on Apache Pig. The slide deck goes through a brief intro to Pig Latin, then jumps into an explanation of the different join algorithms, and finishes up with some research ideas. A pretty wide-ranging talk, for a diverse audience.</p>
<p>Scribd messed up some of the colors, so if you can&#8217;t read some of the text, try downloading the original.</p>
<iframe class="scribd_iframe_embed" src="http://www.scribd.com/embeds/22083049/content?start_page=1&amp;view_mode=&amp;access_key=key-1l6qtqznllnebdt6uylm" data-auto-height="true" scrolling="no" id="scribd_22083049" width="100%" height="500" frameborder="0"></iframe>
<div style="font-size:10px;text-align:center;width:100%"><a href="http://www.scribd.com/doc/22083049">View this document on Scribd</a></div>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/squarecog.wordpress.com/55/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/squarecog.wordpress.com/55/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/squarecog.wordpress.com/55/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=squarecog.wordpress.com&amp;blog=4884813&amp;post=55&amp;subd=squarecog&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://squarecog.wordpress.com/2009/11/03/apache-pig-apittsburgh-hadoop-user-group/feed/</wfw:commentRss>
		<slash:comments>1</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/7cfa823e92df71d57570952dff52ed2a?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">squarecog</media:title>
		</media:content>
	</item>
	</channel>
</rss>
