-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathProgram.cs
135 lines (121 loc) · 5.76 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
using System.Text;
using System.Xml;
namespace Xml2Txt
{
/// <summary>
/// Interactive console tool that reads an XML file made of <c>page</c> elements
/// (the output messages refer to them as wikipedia pages), concatenates the
/// content of every <c>revision/text</c> element and writes the result into a
/// number of plain-text files of at least a requested size.
/// </summary>
class Program
{
    private const long BytesPerMegabyte = 1024L * 1024L;

    /// <summary>
    /// Runs the conversion repeatedly until the user enters <c>q</c> or the
    /// input stream ends. Any failure of a single run is reported on the
    /// console and does not terminate the program.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        do
        {
            try
            {
                RunOnce();
            }
            catch (Exception ex)
            {
                // Top-level boundary of an interactive tool: report and let the user retry.
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("q to quit, return to run again");
        }
        while (ShouldRunAgain());
    }

    /// <summary>
    /// Reads the answer to the "run again" prompt.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the user typed <c>q</c> or when standard input has
    /// ended (ReadLine returned null); otherwise <c>true</c>.
    /// </returns>
    private static bool ShouldRunAgain()
    {
        var answer = Console.ReadLine();

        // A null answer means end of input; looping again would never terminate.
        return answer is not null && answer != "q";
    }

    /// <summary>
    /// Performs one complete prompt / convert / report cycle. Returns early,
    /// after printing "Invalid input!", when any answer is unusable.
    /// </summary>
    private static void RunOnce()
    {
        Console.WriteLine("How many files?");
        if (!int.TryParse(Console.ReadLine(), out int filesToCreate) || filesToCreate <= 0)
        {
            Console.WriteLine("Invalid input!");
            return;
        }

        Console.WriteLine("Min Filesize in MB?");
        if (!int.TryParse(Console.ReadLine(), out int minMegabytes) || minMegabytes < 0)
        {
            Console.WriteLine("Invalid input!");
            return;
        }

        Console.WriteLine("Source file?");
        var sourceName = Console.ReadLine();
        Console.WriteLine("Enter the name for this data set...");
        var dirName = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(sourceName) || string.IsNullOrWhiteSpace(dirName))
        {
            Console.WriteLine("Invalid input!");
            return;
        }

        string path = Path.Combine(Environment.CurrentDirectory, sourceName);

        // Computed in 64-bit arithmetic so thresholds of 2048 MB and above do not overflow.
        long minFileBytes = minMegabytes * BytesPerMegabyte;

        var (pagesProcessed, filesCreated) = SplitPages(path, dirName, filesToCreate, minFileBytes);
        if (pagesProcessed > 0)
        {
            Console.WriteLine("Processed:" + pagesProcessed + " wikipedia pages");
            Console.WriteLine("Saved:" + filesCreated + " files in " + dirName);
        }
    }

    /// <summary>
    /// Streams the <c>page</c> elements of the XML file at <paramref name="path"/>
    /// and writes their revision text into files named
    /// <c>{dirName}_{index}.txt</c> inside the directory <paramref name="dirName"/>.
    /// A file is written each time the accumulated byte count reaches
    /// <paramref name="minFileBytes"/>; whatever is left at the end of the
    /// input is written as a final, smaller file.
    /// </summary>
    /// <param name="path">Full path of the XML source file.</param>
    /// <param name="dirName">Output directory, also used as the file name prefix.</param>
    /// <param name="filesToCreate">Processing stops once this many threshold-sized files were written.</param>
    /// <param name="minFileBytes">Minimum accumulated size, in bytes, before a file is written.</param>
    /// <returns>The number of pages read and the number of files written.</returns>
    private static (int PagesProcessed, int FilesCreated) SplitPages(
        string path, string dirName, int filesToCreate, long minFileBytes)
    {
        int pagesProcessed = 0;
        int filesCreated = 0;
        long pendingBytes = 0;
        var pendingText = new StringBuilder(5000);

        // The using declaration closes the reader on every exit path, including exceptions.
        using XmlReader reader = XmlReader.Create(path);
        if (!reader.ReadToDescendant("page"))
        {
            Console.WriteLine("No <page> elements found in " + path);
            return (pagesProcessed, filesCreated);
        }

        Directory.CreateDirectory(dirName);

        // Only siblings of the first page are considered; the loop ends when the
        // reader reaches the parent's end tag (smaller depth) or the end of input.
        int pageDepth = reader.Depth;
        while (!reader.EOF && reader.Depth >= pageDepth)
        {
            if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
            {
                // Whitespace, comments and unrelated sibling elements (with their subtrees).
                reader.Skip();
                continue;
            }

            // ReadOuterXml leaves the reader on the node AFTER </page>, so the loop must
            // inspect the current node next instead of advancing again - otherwise an
            // immediately following <page> would be skipped.
            var page = new XmlDocument();
            page.LoadXml(reader.ReadOuterXml());
            if (page.DocumentElement is not XmlElement pageElement)
            {
                continue;
            }

            pagesProcessed++;
            foreach (XmlNode revisionNode in pageElement.ChildNodes)
            {
                if (revisionNode.Name != "revision")
                {
                    continue;
                }

                foreach (XmlNode textNode in revisionNode.ChildNodes)
                {
                    if (textNode.Name != "text")
                    {
                        continue;
                    }

                    string text = textNode.InnerText;
                    pendingBytes += GetTextByteCount(textNode, text);
                    pendingText.Append(text);
                    if (pendingBytes < minFileBytes)
                    {
                        continue;
                    }

                    string outputFile = dirName + "_" + filesCreated++;
                    CreateTxtFile(dirName, outputFile, pendingText.ToString(), pendingBytes / BytesPerMegabyte);
                    pendingText.Clear();
                    pendingBytes = 0;
                    if (filesCreated >= filesToCreate)
                    {
                        return (pagesProcessed, filesCreated);
                    }
                }
            }
        }

        if (pendingBytes > 0)
        {
            // Input ended before the threshold was reached: flush the remainder.
            string outputFile = dirName + "_" + filesCreated++;
            CreateTxtFile(dirName, outputFile, pendingText.ToString(), pendingBytes / BytesPerMegabyte);
        }

        return (pagesProcessed, filesCreated);
    }

    /// <summary>
    /// Returns the size to account for one <c>text</c> element: the value of its
    /// <c>bytes</c> attribute when present and a valid non-negative integer,
    /// otherwise the UTF-8 byte count of <paramref name="text"/>.
    /// </summary>
    /// <param name="textNode">The <c>text</c> element.</param>
    /// <param name="text">The element's inner text.</param>
    /// <returns>The byte count to add to the running total.</returns>
    private static long GetTextByteCount(XmlNode textNode, string text)
    {
        var declared = textNode.Attributes?["bytes"]?.Value;
        bool parsed = long.TryParse(
            declared,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out long declaredBytes);

        if (parsed && declaredBytes >= 0)
        {
            return declaredBytes;
        }

        return Encoding.UTF8.GetByteCount(text);
    }

    /// <summary>
    /// Writes <paramref name="content"/> as UTF-8 (without a byte order mark) to
    /// <c>{directory}/{outputFile}.txt</c> and reports the file on the console.
    /// </summary>
    /// <param name="directory">Existing directory that receives the file.</param>
    /// <param name="outputFile">File name without extension.</param>
    /// <param name="content">Text to write.</param>
    /// <param name="size">Size in whole megabytes, used only for the console message.</param>
    private static void CreateTxtFile(string directory, string outputFile, string content, long size)
    {
        string fileName = outputFile + ".txt";

        // GetBytes never emits the preamble, so the file has no BOM.
        byte[] info = new UTF8Encoding(true).GetBytes(content);
        File.WriteAllBytes(Path.Combine(directory, fileName), info);

        Console.WriteLine("Created:" + fileName + " Size:" + size + "MB");
    }
}
}