Skip to content

Commit 7937fa5

Browse files
committed
Executive System Touch M5+ 75
1 parent 5f66e3f commit 7937fa5

11 files changed

Lines changed: 4372 additions & 11 deletions
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
package city_analysis;
2+
3+
import java.io.*;
4+
import java.net.*;
5+
import java.nio.charset.StandardCharsets;
6+
import java.nio.file.*;
7+
import java.time.LocalDateTime;
8+
import java.time.format.DateTimeFormatter;
9+
import java.util.*;
10+
import java.util.regex.*;
11+
import javax.net.ssl.*;
12+
import javax.xml.parsers.*;
13+
import org.w3c.dom.*;
14+
15+
/**
 * CityAnalysisCrawler — Aware API approach for crawling property/deeds websites.
 *
 * <p>Performs a breadth-first crawl from a set of seed URLs, stays on the host of the
 * page each link was found on, and stores accepted content (by default html/xml/csv)
 * under {@code <rawDir>/<yyyy-MM-dd>/<HH-mm-ss>/}. Depth, page count, delay between
 * requests, timeouts and user agent are read from the {@code crawl-options} and
 * {@code connection} elements of {@value #CONFIG_PATH}; any option that is missing,
 * empty or invalid keeps its built-in default.
 *
 * <p>An instance keeps its {@link #visited} set and {@link #storedFiles} list across
 * calls to {@link #crawl(String...)}; nothing in this class resets them.
 *
 * <p>NOTE(review): the constructor calls the overridable {@link #loadConfig()}; a
 * subclass override runs before the subclass's own fields are initialised.
 *
 * @author Max Rupplin
 * @date June 23 2026
 */
public class CityAnalysisCrawler
{
    protected String hash = "0xCA717018470E914C";

    protected static final String CONFIG_PATH = "source/city/analysis/city-analysis-config.xml";

    /** Quoted href attribute values; group 1 holds a double-quoted value, group 2 a single-quoted one. */
    private static final Pattern HREF_PATTERN =
        Pattern.compile("href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')", Pattern.CASE_INSENSITIVE);

    /** The charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
        Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Layout of the per-session sub-directory below {@link #rawDir}. */
    private static final DateTimeFormatter SESSION_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH-mm-ss");

    protected String rawDir = "source/city/analysis/raw/";
    protected boolean followLinks = true;
    protected int maxDepth = 3;
    protected int maxPages = 50;
    protected Set<String> acceptedTypes = new HashSet<>(Arrays.asList("html", "xml", "csv"));
    protected long delayMs = 2000;
    protected int timeoutMs = 15000;
    protected String userAgent = "NitroWebExpress/CityAnalysis 1.0";

    /**
     * SECURITY: when true (the historical behaviour, kept as the default for backward
     * compatibility) HTTPS requests accept ANY certificate and ANY host name, which
     * leaves fetched data open to man-in-the-middle tampering. Set to false, or add
     * {@code <trust-all-certificates>false</trust-all-certificates>} to the
     * {@code connection} config element, to use the JVM's normal TLS validation.
     */
    protected boolean trustAllCertificates = true;

    protected Set<String> visited = new HashSet<>();
    protected List<Path> storedFiles = new ArrayList<>();
    protected String sessionDir;

    /** Lazily created trust-all socket factory, shared by all requests of this instance. */
    private SSLSocketFactory insecureSocketFactory;

    /** A URL waiting to be crawled together with its link distance from a seed. */
    private static final class QueueEntry
    {
        final String url;
        final int depth;

        QueueEntry(String url, int depth)
        {
            this.url = url;
            this.depth = depth;
        }
    }

    /**
     * Loads the configuration and derives the session directory from the current local time.
     */
    public CityAnalysisCrawler()
    {
        loadConfig();

        // rawDir may come from config without a trailing separator; plain concatenation
        // would then glue the date onto the directory name.
        String base = rawDir;
        if (!base.isEmpty() && !base.endsWith("/") && !base.endsWith("\\")) base += "/";
        sessionDir = base + LocalDateTime.now().format(SESSION_FORMAT) + "/";
    }

    /**
     * Reads crawl and connection options from {@link #CONFIG_PATH}.
     *
     * <p>Each option is applied independently: a missing, empty, negative or
     * non-numeric value is reported (where applicable) and leaves the current default
     * in place instead of aborting the whole load. If the file cannot be read or
     * parsed, all defaults are kept and the failure is logged to stderr.
     */
    protected void loadConfig()
    {
        try
        {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new File(CONFIG_PATH));
            doc.getDocumentElement().normalize();

            NodeList crawlNodes = doc.getElementsByTagName("crawl-options");
            if (crawlNodes.getLength() > 0)
            {
                Element crawl = (Element) crawlNodes.item(0);
                rawDir = configText(crawl, "raw-dir", rawDir);
                followLinks = configBoolean(crawl, "follow-links", followLinks);
                maxDepth = (int) configNumber(crawl, "max-depth", maxDepth, Integer.MAX_VALUE);
                maxPages = (int) configNumber(crawl, "max-pages", maxPages, Integer.MAX_VALUE);
                delayMs = configNumber(crawl, "delay-between-requests-ms", delayMs, Long.MAX_VALUE);

                // Entries are trimmed and lower-cased so "html, XML" matches the
                // lower-case extensions produced by inferExtension().
                Set<String> types = new HashSet<>();
                for (String type : getTag(crawl, "accepted-types").split(","))
                {
                    String cleaned = type.trim().toLowerCase(Locale.ROOT);
                    if (!cleaned.isEmpty()) types.add(cleaned);
                }
                if (!types.isEmpty()) acceptedTypes = types;
            }

            NodeList connNodes = doc.getElementsByTagName("connection");
            if (connNodes.getLength() > 0)
            {
                Element conn = (Element) connNodes.item(0);
                timeoutMs = (int) configNumber(conn, "timeout-ms", timeoutMs, Integer.MAX_VALUE);
                userAgent = configText(conn, "user-agent", userAgent);
                trustAllCertificates = configBoolean(conn, "trust-all-certificates", trustAllCertificates);
            }

            System.out.println("-- : [CityAnalysisCrawler] Config loaded. maxDepth:" + maxDepth + " maxPages:" + maxPages + " delay:" + delayMs + "ms");
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Config load failed: " + e.getMessage());
        }
    }

    /** Returns the tag's trimmed text, or {@code fallback} when the tag is missing or empty. */
    private String configText(Element parent, String tag, String fallback)
    {
        String value = getTag(parent, tag);
        return value.isEmpty() ? fallback : value;
    }

    /** Returns the tag parsed as a boolean, or {@code fallback} when the tag is missing or empty. */
    private boolean configBoolean(Element parent, String tag, boolean fallback)
    {
        String value = getTag(parent, tag);
        return value.isEmpty() ? fallback : Boolean.parseBoolean(value);
    }

    /** Returns the tag parsed as a number in {@code [0, max]}, or {@code fallback} when missing or invalid. */
    private long configNumber(Element parent, String tag, long fallback, long max)
    {
        String value = getTag(parent, tag);
        if (value.isEmpty()) return fallback;
        try
        {
            long parsed = Long.parseLong(value);
            if (parsed >= 0 && parsed <= max) return parsed;
        }
        catch (NumberFormatException ignored)
        {
            // reported below together with out-of-range values
        }
        System.err.println("-- : [CityAnalysisCrawler] Ignoring invalid <" + tag + "> value '" + value + "', keeping " + fallback);
        return fallback;
    }

    /**
     * Crawl starting from a set of seed URLs.
     *
     * <p>URLs are processed breadth-first. A URL is fetched at most once per instance,
     * never deeper than {@link #maxDepth}, and crawling stops once {@link #maxPages}
     * URLs have been attempted. {@link #delayMs} is waited between consecutive
     * requests (including after failed ones). If the thread is interrupted while
     * waiting, the crawl stops early and the interrupt flag is restored.
     *
     * @param seedUrls starting URLs (depth 0); {@code null} entries are skipped
     * @return the instance's list of stored files (the same list as {@link #getStoredFiles()})
     */
    public List<Path> crawl(String... seedUrls)
    {
        try
        {
            Files.createDirectories(Paths.get(sessionDir));
        }
        catch (Exception e)
        {
            // Not fatal here: storeRaw() retries and reports per file.
            System.err.println("-- : [CityAnalysisCrawler] Could not create session dir " + sessionDir + ": " + e.getMessage());
        }

        Deque<QueueEntry> queue = new ArrayDeque<>();
        // URLs already placed on the queue. Because the queue is FIFO, the first time a
        // URL is queued is also its shallowest depth, so later duplicates can be dropped.
        Set<String> queued = new HashSet<>();
        if (seedUrls != null)
        {
            for (String url : seedUrls)
            {
                if (url != null && queued.add(url)) queue.add(new QueueEntry(url, 0));
            }
        }

        boolean requestMade = false;
        while (!queue.isEmpty() && visited.size() < maxPages)
        {
            QueueEntry entry = queue.poll();
            String url = entry.url;
            int depth = entry.depth;

            if (visited.contains(url) || depth > maxDepth) continue;

            // Rate limiting: wait before every request except the very first one.
            if (requestMade && !pauseBetweenRequests()) break;
            requestMade = true;
            visited.add(url);

            System.out.println("-- : [CityAnalysisCrawler] Crawling (" + depth + "/" + maxDepth + "): " + url);

            String content = fetch(url);
            if (content == null) continue;

            // Determine content type and store
            String ext = inferExtension(url, content);
            if (acceptedTypes.contains(ext))
            {
                Path stored = storeRaw(url, content, ext);
                if (stored != null) storedFiles.add(stored);
            }

            // Extract and queue links if following
            if (followLinks && depth < maxDepth)
            {
                for (String link : extractLinks(url, content))
                {
                    if (!visited.contains(link) && queued.add(link))
                    {
                        queue.add(new QueueEntry(link, depth + 1));
                    }
                }
            }
        }

        System.out.println("-- : [CityAnalysisCrawler] Crawl complete. Pages:" + visited.size() + " Files stored:" + storedFiles.size());
        return storedFiles;
    }

    /** Sleeps for {@link #delayMs}; returns false (with the interrupt flag restored) if interrupted. */
    private boolean pauseBetweenRequests()
    {
        if (delayMs <= 0) return true; // Thread.sleep rejects negative values
        try
        {
            Thread.sleep(delayMs);
            return true;
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Fetch URL content over HTTP or HTTPS.
     *
     * <p>The body is decoded with the charset announced in the Content-Type header
     * (UTF-8 when absent or unknown) and returned with every line terminated by
     * {@code "\n"}. Only status 200 yields content.
     *
     * @param urlStr absolute http/https URL
     * @return the response body, or {@code null} on a non-200 status, an unsupported
     *         protocol, or any error (all of which are logged to stderr)
     */
    protected String fetch(String urlStr)
    {
        HttpURLConnection conn = null;
        try
        {
            URL url = new URL(urlStr);
            String protocol = url.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol))
            {
                System.err.println("-- : [CityAnalysisCrawler] Fetch error: " + urlStr + " — unsupported protocol " + protocol);
                return null;
            }

            conn = (HttpURLConnection) url.openConnection();
            if (trustAllCertificates && conn instanceof HttpsURLConnection)
            {
                // SECURITY: certificate and host-name validation are disabled here.
                HttpsURLConnection httpsConn = (HttpsURLConnection) conn;
                httpsConn.setSSLSocketFactory(insecureSocketFactory());
                httpsConn.setHostnameVerifier((h, s) -> true);
            }

            conn.setRequestMethod("GET");
            conn.setConnectTimeout(timeoutMs);
            conn.setReadTimeout(timeoutMs);
            conn.setRequestProperty("User-Agent", userAgent);
            conn.setInstanceFollowRedirects(true);

            int code = conn.getResponseCode();
            if (code != HttpURLConnection.HTTP_OK)
            {
                System.err.println("-- : [CityAnalysisCrawler] HTTP " + code + " from " + urlStr);
                return null;
            }

            try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), responseCharset(conn))))
            {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) sb.append(line).append("\n");
                return sb.toString();
            }
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Fetch error: " + urlStr + " — " + e.getMessage());
            return null;
        }
        finally
        {
            // Releases the connection even when the body was never read (non-200, errors).
            if (conn != null) conn.disconnect();
        }
    }

    /** Returns the charset named in the response's Content-Type header, or UTF-8 if absent or unknown. */
    private static java.nio.charset.Charset responseCharset(URLConnection conn)
    {
        String contentType = conn.getContentType();
        if (contentType != null)
        {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find())
            {
                try
                {
                    return java.nio.charset.Charset.forName(m.group(1));
                }
                catch (IllegalArgumentException ignored)
                {
                    // illegal or unsupported charset name: fall back to UTF-8
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Builds (once per instance) a socket factory whose trust manager accepts every certificate. */
    private SSLSocketFactory insecureSocketFactory() throws java.security.GeneralSecurityException
    {
        if (insecureSocketFactory == null)
        {
            System.err.println("-- : [CityAnalysisCrawler] WARNING: TLS certificate validation is disabled (trustAllCertificates=true)");
            TrustManager[] trustAll = new TrustManager[]{
                new X509TrustManager()
                {
                    @Override
                    public java.security.cert.X509Certificate[] getAcceptedIssuers()
                    {
                        // The X509TrustManager contract requires a non-null array.
                        return new java.security.cert.X509Certificate[0];
                    }

                    @Override
                    public void checkClientTrusted(java.security.cert.X509Certificate[] c, String a) {}

                    @Override
                    public void checkServerTrusted(java.security.cert.X509Certificate[] c, String a) {}
                }
            };
            SSLContext ctx = SSLContext.getInstance("TLS");
            ctx.init(null, trustAll, new java.security.SecureRandom());
            insecureSocketFactory = ctx.getSocketFactory();
        }
        return insecureSocketFactory;
    }

    /**
     * Store raw content to {@code <sessionDir>/<filename>}.
     *
     * <p>The file name is the URL without its scheme, with every character outside
     * {@code [a-zA-Z0-9.-]} replaced by {@code _}, cut to 100 characters, plus
     * {@code "." + ext}. If that name is already taken in the session directory
     * (different URLs can sanitise or truncate to the same name) a numeric suffix
     * {@code _1}, {@code _2}, ... is inserted before the extension instead of
     * overwriting the earlier file.
     *
     * @param url     source URL, used only to derive the file name
     * @param content text to write
     * @param ext     extension without the leading dot
     * @return path of the written file, or {@code null} if writing failed (logged to stderr)
     */
    protected Path storeRaw(String url, String content, String ext)
    {
        try
        {
            String base = url.replaceAll("https?://", "").replaceAll("[^a-zA-Z0-9.-]", "_");
            if (base.length() > 100) base = base.substring(0, 100);

            Path dir = Paths.get(sessionDir);
            Files.createDirectories(dir); // storeRaw may be called without a prior crawl()

            Path outPath = dir.resolve(base + "." + ext);
            for (int n = 1; Files.exists(outPath); n++)
            {
                outPath = dir.resolve(base + "_" + n + "." + ext);
            }

            Files.writeString(outPath, content);
            System.out.println("-- : [CityAnalysisCrawler] Stored: " + outPath + " (" + content.length() + " chars)");
            return outPath;
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Store error: " + e.getMessage());
            return null;
        }
    }

    /**
     * Extract links from HTML content, resolved relative to the base URL.
     *
     * <p>Only quoted {@code href} values are considered. Fragments ({@code #...}) are
     * stripped, {@code &amp;} is decoded, and only http/https links on the same host
     * as {@code baseUrl} are kept. Each distinct URL is returned once, in order of
     * first appearance.
     *
     * @param baseUrl URL of the page the content came from
     * @param content page text to scan
     * @return same-host absolute URLs; empty if {@code baseUrl} cannot be parsed
     */
    protected List<String> extractLinks(String baseUrl, String content)
    {
        List<String> links = new ArrayList<>();
        URL base;
        try
        {
            base = new URL(baseUrl);
        }
        catch (Exception e)
        {
            return links;
        }
        String baseHost = base.getHost();

        Set<String> unique = new LinkedHashSet<>();
        Matcher m = HREF_PATTERN.matcher(content);
        while (m.find())
        {
            String href = m.group(1) != null ? m.group(1) : m.group(2);

            // Drop the fragment: it addresses a position inside the same document.
            int fragment = href.indexOf('#');
            if (fragment >= 0) href = href.substring(0, fragment);

            href = href.trim().replace("&amp;", "&");
            if (href.isEmpty()) continue;

            try
            {
                URL resolved = new URL(base, href);
                String protocol = resolved.getProtocol();
                boolean web = "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
                // Stay on same host
                if (web && resolved.getHost().equalsIgnoreCase(baseHost))
                {
                    unique.add(resolved.toString());
                }
            }
            catch (Exception ignored)
            {
                // skip malformed
            }
        }
        links.addAll(unique);
        return links;
    }

    /**
     * Infer file extension from URL or content.
     *
     * <p>The URL is compared case-insensitively and without its query string or
     * fragment. Content checks ignore a leading byte-order mark and surrounding
     * whitespace.
     *
     * @param url     URL the content was fetched from
     * @param content fetched text
     * @return {@code "csv"}, {@code "xml"} or, when neither is recognised, {@code "html"}
     */
    protected String inferExtension(String url, String content)
    {
        String path = url.toLowerCase(Locale.ROOT);
        int query = path.indexOf('?');
        if (query >= 0) path = path.substring(0, query);
        int fragment = path.indexOf('#');
        if (fragment >= 0) path = path.substring(0, fragment);

        String body = content.startsWith("\uFEFF") ? content.substring(1) : content;
        body = body.trim();

        if (path.endsWith(".csv") || (body.startsWith("\"") && body.contains(","))) return "csv";
        if (path.endsWith(".xml") || body.startsWith("<?xml")) return "xml";
        return "html";
    }

    /**
     * @return the live list of files stored by this instance so far (not a copy)
     */
    public List<Path> getStoredFiles()
    {
        return storedFiles;
    }

    /**
     * Returns the trimmed text of the first descendant element named {@code tag}.
     *
     * @param parent element to search below
     * @param tag    element name to look for
     * @return the trimmed text content, or {@code ""} if no such element exists
     */
    protected String getTag(Element parent, String tag)
    {
        NodeList nodes = parent.getElementsByTagName(tag);
        return nodes.getLength() > 0 ? nodes.item(0).getTextContent().trim() : "";
    }
}

0 commit comments

Comments
 (0)