-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExplorer.cs
More file actions
110 lines (101 loc) · 3.99 KB
/
Explorer.cs
File metadata and controls
110 lines (101 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HAP=HtmlAgilityPack;
namespace WebCrawler
{
public class Explorer
{
/*
private string fileName;
private string url;
private Dictionary<string,bool> queryWords;
private WordProcessor wordProcessor;
public Explorer(string url,string fileName, List<string> queryWords)
{
this.fileName = fileName;
this.url = url;
this.queryWords = new Dictionary<string,bool>();
foreach (var word in queryWords)
{
this.queryWords.Add(word, false);
}
this.wordProcessor = new WordProcessor(fileName);
}
public void explore()
{
while (!wordProcessor.isEOF()){
foreach(var queryWord in queryWords)
{
queryWords[queryWord.Key] = queryWords[queryWord.Key] || queryWord.Key == wordProcessor.getWord();
}
wordProcessor.advanceWord();
}
}*/
/// <summary>
/// Reads the crawl result listing of a folder and returns it through <paramref name="retval"/>.
/// </summary>
/// <remarks>
/// The listing is opened at <c>Configuration.getCrawlResultLocation() + folderName + ".result"</c>.
/// Records are consumed three words at a time: the URL, the file name, and a third
/// word that is skipped without being read.
/// </remarks>
/// <param name="folderName">Crawl folder name used to build the path of the <c>.result</c> file.</param>
/// <param name="retval">
/// Always reassigned to a new list; whatever list the caller passed in is discarded.
/// On return it holds one element per record read.
/// </param>
public static void getCrawlResult(string folderName,ref List<Crawler.crawlElement> retval)
{
    // Assigned before the file is opened so the caller sees an empty list
    // even if opening the listing fails.
    retval = new List<Crawler.crawlElement>();
    WordProcessor wp = new WordProcessor(Configuration.getCrawlResultLocation() + folderName + ".result");
    try
    {
        while (!wp.isEOF())
        {
            string url = wp.getWord();
            wp.advanceWord();
            string fileName = wp.getWord();
            wp.advanceWord();
            // Step over the third word of the record; its value is not used here.
            wp.advanceWord();
            retval.Add(new Crawler.crawlElement(url, fileName));
        }
    }
    finally
    {
        // Release the file even when reading a record throws.
        wp.closeFile();
    }
}
/// <summary>
/// Searches the index files of a crawled folder for the words of a query and returns
/// the crawl elements whose index contains at least one of them.
/// </summary>
/// <remarks>
/// The query is split on single spaces. For every crawl element, <c>haveKey[j]</c> is set
/// to true when the j-th query word occurs in that element's index file. Words are compared
/// case-insensitively using ordinal rules, so the result does not depend on the current culture.
/// </remarks>
/// <param name="folderName">Crawl folder name; selects both the result listing and the index directory.</param>
/// <param name="query">Space-separated query words.</param>
/// <param name="maxSize">Currently not used by this method. NOTE(review): intended meaning is not visible here — confirm with callers.</param>
/// <returns>The crawl elements that matched at least one query word, in listing order.</returns>
public static List<Crawler.crawlElement> explore(string folderName,string query,int maxSize)
{
    List<string> qWords = query.Split(' ').ToList();
    List<Crawler.crawlElement> crawlResults = new List<Crawler.crawlElement>();
    getCrawlResult(folderName, ref crawlResults);
    for (int i = 0; i < crawlResults.Count; ++i)
    {
        var element = crawlResults[i];
        element.initListBool(qWords.Count);

        // Swap the last five characters of the path for ".index".
        // NOTE(review): presumably the stored file names end in ".html" — confirm.
        string indexPath = Configuration.getIndexLocation() + folderName + "/" + element.fileName;
        indexPath = indexPath.Remove(indexPath.Length - 5) + ".index";

        WordProcessor wp = new WordProcessor(indexPath);
        try
        {
            // The word that is current when isEOF() turns true still has to be
            // checked, so the EOF test happens before the match and the loop
            // only exits after that final word has been examined.
            while (true)
            {
                bool atEnd = wp.isEOF();
                string word = wp.getWord();
                for (int j = 0; j < qWords.Count; ++j)
                {
                    if (!element.haveKey[j] && string.Equals(word, qWords[j], StringComparison.OrdinalIgnoreCase))
                    {
                        element.haveKey[j] = true;
                    }
                }
                if (atEnd)
                {
                    break;
                }
                wp.advanceWord();
            }
        }
        finally
        {
            // Release the index file even when reading it throws.
            wp.closeFile();
        }
    }
    filterResult(ref crawlResults);
    return crawlResults;
}
/// <summary>
/// Placeholder that currently produces no content: it always returns an empty string.
/// </summary>
/// <param name="doc">HTML document; neither read nor modified by the current implementation.</param>
/// <param name="qWords">Query words; neither read nor modified by the current implementation.</param>
/// <returns>An empty string.</returns>
public string getContent(ref HAP.HtmlDocument doc,ref List<string> qWords)
{
    // Stub: no extraction is performed yet.
    return string.Empty;
}
/// <summary>
/// Narrows <paramref name="list"/> to the elements for which <c>atLeastOneTrue</c> returns true.
/// </summary>
/// <param name="list">
/// Reassigned to a newly created list that holds the surviving elements in their
/// original order; the list object that was passed in is not modified.
/// </param>
public static void filterResult(ref List<Crawler.crawlElement> list)
{
    // FindAll builds a fresh list, so the caller's reference is swapped rather than edited in place.
    list = list.FindAll(atLeastOneTrue);
}
/// <summary>
/// Reports whether any entry of the element's <c>haveKey</c> collection is true.
/// </summary>
/// <param name="list">Crawl element whose <c>haveKey</c> flags are inspected.</param>
/// <returns>True if at least one flag is set; false if none are set or there are no flags.</returns>
public static bool atLeastOneTrue(Crawler.crawlElement list)
{
    bool anySet = false;
    foreach (var flag in list.haveKey)
    {
        if (flag)
        {
            // One set flag is enough; no need to look at the rest.
            anySet = true;
            break;
        }
    }
    return anySet;
}
}
}