-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLentaArticle.cs
75 lines (74 loc) · 2.75 KB
/
LentaArticle.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using HtmlAgilityPack;
namespace s2._3
{
/// <summary>
/// A single lenta.ru article page. The page is downloaded in the constructor; the methods
/// extract the headline, body text, publication time, category and links to further articles.
/// </summary>
/// <remarks>
/// The type keeps deriving from <see cref="HtmlDocument"/> so existing callers continue to compile,
/// but the inherited document is never populated: every query runs against the privately held
/// page loaded in the constructor.
/// </remarks>
class LentaArticle : HtmlDocument
{
    /// <summary>Prefix prepended to site-relative links returned by <see cref="GetLinksNext"/>.</summary>
    private const string SiteRoot = "https://lenta.ru";

    /// <summary>Inclusive lower bound, in milliseconds, of the pause made before each download.</summary>
    private const int MinDelayMs = 120;

    /// <summary>Exclusive upper bound, in milliseconds, of the pause made before each download.</summary>
    private const int MaxDelayMs = 900;

    // One shared generator: creating a new Random per call can yield identical values for
    // instances created in quick succession on runtimes that seed from the clock.
    private static readonly Random DelaySource = new Random();

    // Patterns are kept byte-identical to the ones previously passed to Regex.IsMatch.
    private static readonly Regex ArticlePathPattern = new Regex(@"\/(articles|news)\/", RegexOptions.Compiled);
    private static readonly Regex ExcludedLinkPattern = new Regex(@"https|www|http|comments", RegexOptions.Compiled);

    // Anchored on the assignment itself so that an unrelated quoted string earlier in the
    // script (or a second one on the same line) cannot be captured by mistake.
    private static readonly Regex CategoryPattern =
        new Regex("window\\.Lenta\\.bloc_slug\\s*=\\s*\"([^\"]*)\"", RegexOptions.Compiled);

    private readonly HtmlDocument _document;

    /// <summary>
    /// Waits a random 120-899 ms and then downloads the page at <paramref name="lnk"/>.
    /// </summary>
    /// <param name="lnk">Absolute URL of the article page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="lnk"/> is <c>null</c>.</exception>
    public LentaArticle(string lnk)
    {
        // Fail before sleeping rather than after.
        if (lnk == null)
        {
            throw new ArgumentNullException(nameof(lnk));
        }

        Thread.Sleep(NextDelay());
        _document = new HtmlWeb().Load(lnk);
    }

    /// <summary>Returns the next pause length in milliseconds.</summary>
    private static int NextDelay()
    {
        // Random is not thread-safe, and the instance is now shared between all articles.
        lock (DelaySource)
        {
            return DelaySource.Next(MinDelayMs, MaxDelayMs);
        }
    }

    /// <summary>
    /// Normalises extracted text: guillemets become straight double quotes, non-breaking
    /// spaces (both the <c>&amp;nbsp;</c> entity and the U+00A0 character) become ordinary
    /// spaces, and square brackets are trimmed from both ends.
    /// </summary>
    private static string Clear(string str) =>
        str.Replace('«', '"')
           .Replace('»', '"')
           .Replace("&nbsp;", " ")
           .Replace('\u00A0', ' ')
           .Trim('[', ']');

    /// <summary>Writes <paramref name="message"/> to the console in red and restores the colours.</summary>
    private static void ReportError(string message)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine(message);
        Console.ResetColor();
    }

    /// <summary>
    /// Extracts the headline from the <c>content</c> attribute of the first <c>meta</c> element.
    /// </summary>
    /// <returns>
    /// The cleaned text up to (not including) the first ':'; the whole cleaned text when it
    /// contains no ':'; an empty string (after reporting to the console) when the element or
    /// its content is missing.
    /// </returns>
    public string GetHead()
    {
        // SelectSingleNode returns null when nothing matches.
        HtmlNode meta = _document.DocumentNode.SelectSingleNode("//meta[1]");
        string content = meta == null ? "" : Clear(meta.GetAttributeValue("content", ""));

        if (content.Length == 0)
        {
            ReportError("Meta content not found." + " Ошибка в извлечении заголовка статьи.");
            return "";
        }

        // A missing colon is not an error: the whole text is the headline.
        int colon = content.IndexOf(':');
        return colon >= 0 ? content.Substring(0, colon) : content;
    }

    /// <summary>
    /// Concatenates the cleaned text of every paragraph directly under the
    /// <c>div[@itemprop="articleBody"]</c> element. No separator is inserted between paragraphs.
    /// </summary>
    /// <returns>The body text, or an empty string (after reporting to the console) when no paragraph is found.</returns>
    public string GetBody()
    {
        // SelectNodes returns null, not an empty collection, when nothing matches.
        HtmlNodeCollection paragraphs =
            _document.DocumentNode.SelectNodes("//div[@itemprop = \"articleBody\"]/p");

        if (paragraphs == null)
        {
            ReportError("Article body paragraphs not found." + " Ошибка в извлечении тела статьи.");
            return "";
        }

        var body = new StringBuilder();
        foreach (HtmlNode paragraph in paragraphs)
        {
            body.Append(Clear(paragraph.InnerText));
        }

        return body.ToString();
    }

    /// <summary>
    /// Reads the publication time from the <c>datetime</c> attribute of the
    /// <c>time[@class="g-date"]</c> element.
    /// </summary>
    /// <returns>The parsed timestamp.</returns>
    /// <exception cref="FormatException">
    /// The element or attribute is missing, or its value is not a recognisable date/time.
    /// </exception>
    public DateTime GetDateTime()
    {
        HtmlNode timeNode = _document.DocumentNode.SelectSingleNode("//time[@class = \"g-date\"]");
        string raw = timeNode == null ? "" : timeNode.GetAttributeValue("datetime", "");

        // The attribute is machine-readable markup (presumably ISO 8601 - confirm against the
        // live page), so it is parsed culture-independently rather than with the host's locale.
        DateTime parsed;
        if (!DateTime.TryParse(raw, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed))
        {
            throw new FormatException(
                timeNode == null
                    ? "Publication time element (time.g-date) was not found on the page."
                    : $"Publication time '{raw}' is not a valid date/time.");
        }

        return parsed;
    }

    /// <summary>
    /// Collects site-relative links to other articles or news items found on the page.
    /// </summary>
    /// <returns>
    /// Absolute URLs (prefixed with the site root) of every anchor whose <c>href</c> contains
    /// <c>/articles/</c> or <c>/news/</c> and none of <c>http</c>, <c>https</c>, <c>www</c>,
    /// <c>comments</c>. Empty when the page has no matching anchors.
    /// </returns>
    public HashSet<string> GetLinksNext()
    {
        var links = new HashSet<string>();

        // SelectNodes returns null when the page has no anchors at all.
        HtmlNodeCollection anchors = _document.DocumentNode.SelectNodes("//a");
        if (anchors == null)
        {
            return links;
        }

        foreach (HtmlNode anchor in anchors)
        {
            string href = anchor.GetAttributeValue("href", "");

            // Protocol-relative links ("//host/...") already carry a host; prefixing the
            // site root would produce a malformed URL, so they are skipped.
            if (href.StartsWith("//", StringComparison.Ordinal))
            {
                continue;
            }

            if (ArticlePathPattern.IsMatch(href) && !ExcludedLinkPattern.IsMatch(href))
            {
                links.Add(SiteRoot + href);
            }
        }

        return links;
    }

    /// <summary>
    /// Extracts the category slug from the head script that assigns <c>window.Lenta.bloc_slug</c>.
    /// </summary>
    /// <returns>
    /// The double-quoted value assigned to <c>window.Lenta.bloc_slug</c>; an empty string when
    /// the assignment has no double-quoted value, or (after reporting to the console) when
    /// the script is not present.
    /// </returns>
    public string GetCategory()
    {
        HtmlNode script = _document.DocumentNode.SelectSingleNode(
            "//head/descendant::script[text()[contains(.,\"window.Lenta.bloc_slug = \")]]");

        if (script == null)
        {
            ReportError("Script assigning window.Lenta.bloc_slug not found." + " Ошибка в извлечении категории статьи.");
            return "";
        }

        // Groups[1].Value is an empty string when the pattern does not match.
        return CategoryPattern.Match(script.InnerText).Groups[1].Value;
    }
}
}