-
Notifications
You must be signed in to change notification settings - Fork 148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Graph generator with HDFS support #27
base: master
Are you sure you want to change the base?
Changes from all commits
382c7d2
06b8c95
68d78b3
0ccc64c
5c97678
b8efadc
a73037e
3e503e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/* | ||
* Copyright 2012 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
|
||
package com.twitter.pers.graph_generator; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add copyright/license header information, see other files as an example. |
||
|
||
import java.io.BufferedOutputStream; | ||
import java.io.DataOutputStream; | ||
import java.io.FileOutputStream; | ||
import java.io.IOException; | ||
|
||
/** | ||
* Efficient parallel output of edges into a binary edge list format. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls write a couple of lines about this "edge list format" |
||
* @author Aapo Kyrola, [email protected], [email protected] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we generally have not listed author names in the past in the source code. So, pls leave it out. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, please ensure that you use your twitter email as the author too (e.g., git commit --amend --author="abc [email protected]" |
||
*/ | ||
public class EdgeListOutput implements GraphOutput { | ||
|
||
private String fileNamePrefix; | ||
|
||
static int partSeq = 0; | ||
|
||
public EdgeListOutput(String fileNamePrefix) { | ||
this.fileNamePrefix = fileNamePrefix; | ||
} | ||
|
||
@Override | ||
public void addEdges(int[] from, int[] to) { | ||
try { | ||
DataOutputStream dos = partitionOut.get(); | ||
int n = from.length; | ||
for(int i=0; i<n; i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 'n' is not really needed |
||
dos.writeInt(Integer.reverseBytes(from[i])); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any particular rationale to "reverseBytes" ? |
||
dos.writeInt(Integer.reverseBytes(to[i])); | ||
} | ||
} catch (IOException ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
public void finishUp() { | ||
try { | ||
partitionOut.get().close(); | ||
} catch (IOException ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
/* Each thread will have a local partition */ | ||
private ThreadLocal<DataOutputStream> partitionOut = new ThreadLocal<DataOutputStream>() { | ||
@Override | ||
protected DataOutputStream initialValue() { | ||
try { | ||
int thisPartId; | ||
synchronized (this) { | ||
thisPartId = partSeq++; | ||
} | ||
|
||
String fileName = fileNamePrefix + "-part" + thisPartId; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prefer "-part-" (k.e., extra '-' after "part") |
||
return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fileName))); | ||
} catch (Exception err) { | ||
err.printStackTrace(); | ||
throw new RuntimeException(err); | ||
} | ||
} | ||
}; | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/* | ||
* Copyright 2012 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
|
||
package com.twitter.pers.graph_generator; | ||
|
||
/** | ||
* @author Aapo Kyrola, [email protected], [email protected] | ||
*/ | ||
public interface GraphOutput { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls add comments on what this interface is, as well as some doc for each method. |
||
|
||
void addEdges(int[] from, int[] to); | ||
|
||
void finishUp(); | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/* | ||
* Copyright 2012 Twitter, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this | ||
* file except in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed | ||
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | ||
* CONDITIONS OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
|
||
package com.twitter.pers.graph_generator; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add copyright/license header information, see other files as an example. |
||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FSDataOutputStream; | ||
import org.apache.hadoop.fs.FileSystem; | ||
import org.apache.hadoop.fs.Path; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* Outputs edges into HDFS in tab delimited edge list format | ||
* @author Aapo Kyrola, [email protected], [email protected] | ||
*/ | ||
public class HDFSEdgeListOutput implements GraphOutput { | ||
|
||
private String fileNamePrefix; | ||
|
||
static int partSeq = 0; | ||
|
||
public HDFSEdgeListOutput(String fileNamePrefix) { | ||
this.fileNamePrefix = fileNamePrefix; | ||
System.out.println("Using HDFS output: " + fileNamePrefix); | ||
} | ||
|
||
@Override | ||
public void addEdges(int[] from, int[] to) { | ||
try { | ||
FSDataOutputStream dos = partitionOut.get(); | ||
int n = from.length; | ||
StringBuffer sb = new StringBuffer(from.length * 32); | ||
for(int i=0; i<n; i++) { | ||
sb.append(from[i]); | ||
sb.append("\t"); | ||
sb.append(to[i]); | ||
sb.append("\n"); | ||
} | ||
dos.write(sb.toString().getBytes()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems like the binary format here would be different than the one above (in EdgeListOutput). Is that correct? If so, it would be nice to have the same binary format. If it would be the same, then it would be nice to use the same code to do the actual writing if possible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On Tue, Sep 25, 2012 at 3:10 PM, Pankaj Gupta [email protected]:
The binary format was committed in mistake, as it is for generating graphs There were many mistakes in my pull request, for example I don't Aapo |
||
} catch (Exception ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
public void finishUp() { | ||
try { | ||
partitionOut.get().close(); | ||
} catch (IOException ioe) { | ||
throw new RuntimeException(ioe); | ||
} | ||
} | ||
|
||
/* Each thread will have a local partition */ | ||
private ThreadLocal<FSDataOutputStream> partitionOut = new ThreadLocal<FSDataOutputStream>() { | ||
@Override | ||
protected FSDataOutputStream initialValue() { | ||
try { | ||
int thisPartId; | ||
synchronized (this) { | ||
thisPartId = partSeq++; | ||
} | ||
|
||
String hadoopHome = System.getProperty("HADOOP_HOME"); | ||
if (hadoopHome == null) hadoopHome = System.getenv("HADOOP_HOME"); | ||
|
||
if (hadoopHome == null) { | ||
throw new IllegalArgumentException("You need to specify environment variable or JVM option HADOOP_HOME!"); | ||
} | ||
|
||
Configuration conf = new Configuration(); | ||
conf.addResource(new Path(hadoopHome + "/conf/core-site.xml")); | ||
conf.addResource(new Path(hadoopHome + "/conf/hdfs-site.xml")); | ||
|
||
|
||
String fileName = fileNamePrefix + "-part" + thisPartId; | ||
FileSystem fs = FileSystem.get(conf); | ||
return fs.create(new Path(fileName)); | ||
} catch (Exception err) { | ||
err.printStackTrace(); | ||
throw new RuntimeException(err); | ||
} | ||
} | ||
}; | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this equivalent to -Xmx4096m ? If so, kind of big to have.