|
| 1 | +package city_analysis; |
| 2 | + |
| 3 | +import java.io.*; |
| 4 | +import java.net.*; |
| 5 | +import java.nio.charset.StandardCharsets; |
| 6 | +import java.nio.file.*; |
| 7 | +import java.time.LocalDateTime; |
| 8 | +import java.time.format.DateTimeFormatter; |
| 9 | +import java.util.*; |
| 10 | +import java.util.regex.*; |
| 11 | +import javax.net.ssl.*; |
| 12 | +import javax.xml.parsers.*; |
| 13 | +import org.w3c.dom.*; |
| 14 | + |
/**
 * @author Max Rupplin
 *
 * @date June 23 2026
 *
 * CityAnalysisCrawler — same-host breadth-first crawler for property/deeds websites.
 * Follows links, stores raw HTML/XML/CSV under {@code <raw-dir>/<date>/<time>/}, and
 * respects the configured depth, page-count and rate limits.
 * Driven by the crawl-options and connection sections of city-analysis-config.xml;
 * any option that is missing or unparsable keeps its built-in default.
 *
 * Not thread-safe: a single instance keeps mutable crawl state ({@code visited},
 * {@code storedFiles}) that accumulates across calls to {@link #crawl(String...)}.
 */
public class CityAnalysisCrawler
{
    protected String hash = "0xCA717018470E914C";

    protected static final String CONFIG_PATH = "source/city/analysis/city-analysis-config.xml";

    /** Layout of the per-session sub-directory created below {@link #rawDir}. */
    private static final DateTimeFormatter SESSION_STAMP = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH-mm-ss");

    /** Matches quoted href values; an optional {@code #fragment} is matched but not captured. */
    private static final Pattern HREF_PATTERN =
        Pattern.compile("href\\s*=\\s*[\"']([^\"'#]+)(?:#[^\"']*)?[\"']", Pattern.CASE_INSENSITIVE);

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
        Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /** Maximum length of the URL-derived part of a stored file name. */
    private static final int MAX_FILENAME_LENGTH = 100;

    protected String rawDir = "source/city/analysis/raw/";
    protected boolean followLinks = true;
    protected int maxDepth = 3;
    protected int maxPages = 50;
    protected Set<String> acceptedTypes = new HashSet<>(Arrays.asList("html", "xml", "csv"));
    protected long delayMs = 2000;
    protected int timeoutMs = 15000;
    protected String userAgent = "NitroWebExpress/CityAnalysis 1.0";

    /**
     * When true (the historical behaviour), HTTPS requests accept ANY server certificate and
     * host name.
     * NOTE(review): this disables TLS authentication and allows man-in-the-middle tampering of
     * crawled data. Set to false to use the JVM's default certificate and host name checks.
     */
    protected boolean trustAllCertificates = true;

    protected Set<String> visited = new HashSet<>();
    protected List<Path> storedFiles = new ArrayList<>();
    protected String sessionDir;

    /** Lazily created socket factory used only when {@link #trustAllCertificates} is true. */
    private SSLSocketFactory permissiveFactory;

    /** A URL waiting to be crawled together with its link distance from a seed URL. */
    private static final class QueueItem
    {
        final String url;
        final int depth;

        QueueItem(String url, int depth)
        {
            this.url = url;
            this.depth = depth;
        }
    }

    /**
     * Loads the configuration and derives the session directory for this run.
     * NOTE(review): {@link #loadConfig()} is overridable and is invoked from the constructor,
     * so an override runs before the subclass' own fields are initialised.
     */
    public CityAnalysisCrawler()
    {
        loadConfig();

        // The configured raw-dir may lack a trailing separator; add one so the
        // timestamp becomes a sub-directory instead of being glued onto the name.
        String base = rawDir;
        if (!base.isEmpty() && !base.endsWith("/") && !base.endsWith(File.separator))
        {
            base += "/";
        }
        sessionDir = base + LocalDateTime.now().format(SESSION_STAMP) + "/";
    }

    /**
     * Reads crawl and connection options from {@link #CONFIG_PATH}.
     * Each option is applied independently: a missing, empty or invalid value leaves the
     * current default in place. A file that cannot be read or parsed is reported on stderr
     * and leaves all defaults untouched.
     */
    protected void loadConfig()
    {
        try
        {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // Apply the parser's secure-processing limits (e.g. on entity expansion).
            factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);

            Document doc = factory.newDocumentBuilder().parse(new File(CONFIG_PATH));
            doc.getDocumentElement().normalize();

            NodeList crawlNodes = doc.getElementsByTagName("crawl-options");
            if (crawlNodes.getLength() > 0)
            {
                Element crawl = (Element) crawlNodes.item(0);
                rawDir = textOr(getTag(crawl, "raw-dir"), rawDir);

                String follow = getTag(crawl, "follow-links");
                if (!follow.isEmpty()) followLinks = Boolean.parseBoolean(follow);

                maxDepth = parseIntOr(getTag(crawl, "max-depth"), maxDepth, "max-depth");
                maxPages = parseIntOr(getTag(crawl, "max-pages"), maxPages, "max-pages");
                delayMs = parseLongOr(getTag(crawl, "delay-between-requests-ms"), delayMs, "delay-between-requests-ms");

                String types = getTag(crawl, "accepted-types");
                if (!types.isEmpty())
                {
                    // Tolerate "html, XML ,csv": trim, lower-case and drop empty entries.
                    Set<String> parsed = new HashSet<>();
                    for (String type : types.split(","))
                    {
                        String clean = type.trim().toLowerCase(Locale.ROOT);
                        if (!clean.isEmpty()) parsed.add(clean);
                    }
                    if (!parsed.isEmpty()) acceptedTypes = parsed;
                }
            }

            NodeList connNodes = doc.getElementsByTagName("connection");
            if (connNodes.getLength() > 0)
            {
                Element conn = (Element) connNodes.item(0);
                timeoutMs = parseIntOr(getTag(conn, "timeout-ms"), timeoutMs, "timeout-ms");
                userAgent = textOr(getTag(conn, "user-agent"), userAgent);
            }

            System.out.println("-- : [CityAnalysisCrawler] Config loaded. maxDepth:" + maxDepth + " maxPages:" + maxPages + " delay:" + delayMs + "ms");
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Config load failed: " + e.getMessage());
        }
    }

    /**
     * Crawls breadth-first starting from the given seed URLs.
     * Stops when the queue is empty, when {@code maxPages} URLs have been visited (counted
     * across all calls on this instance), or when the thread is interrupted.
     *
     * @param seedUrls starting URLs at depth 0; null entries are ignored
     * @return the live list of files stored so far by this instance
     */
    public List<Path> crawl(String... seedUrls)
    {
        try
        {
            Files.createDirectories(Paths.get(sessionDir));
        }
        catch (Exception e)
        {
            // Keep going: storeRaw retries directory creation and reports its own failures.
            System.err.println("-- : [CityAnalysisCrawler] Could not create session dir " + sessionDir + ": " + e.getMessage());
        }

        Deque<QueueItem> queue = new ArrayDeque<>();
        Set<String> queued = new HashSet<>(); // prevents the same URL being enqueued repeatedly
        if (seedUrls != null)
        {
            for (String seed : seedUrls)
            {
                if (seed != null && queued.add(seed)) queue.add(new QueueItem(seed, 0));
            }
        }

        boolean firstRequest = true;
        while (!queue.isEmpty() && visited.size() < maxPages)
        {
            QueueItem item = queue.poll();
            if (item.depth > maxDepth || visited.contains(item.url)) continue;

            // Rate limiting happens BEFORE every request after the first, so failed
            // fetches are throttled exactly like successful ones.
            if (!firstRequest && delayMs > 0)
            {
                try
                {
                    Thread.sleep(delayMs);
                }
                catch (InterruptedException e)
                {
                    Thread.currentThread().interrupt(); // preserve the interrupt for callers
                    break;
                }
            }
            firstRequest = false;
            visited.add(item.url);

            System.out.println("-- : [CityAnalysisCrawler] Crawling (" + item.depth + "/" + maxDepth + "): " + item.url);

            String content = fetch(item.url);
            if (content == null) continue;

            // Determine content type and store
            String ext = inferExtension(item.url, content);
            if (acceptedTypes.contains(ext))
            {
                Path stored = storeRaw(item.url, content, ext);
                if (stored != null) storedFiles.add(stored);
            }

            // Extract and queue links if following
            if (followLinks && item.depth < maxDepth)
            {
                for (String link : extractLinks(item.url, content))
                {
                    if (!visited.contains(link) && queued.add(link))
                    {
                        queue.add(new QueueItem(link, item.depth + 1));
                    }
                }
            }
        }

        System.out.println("-- : [CityAnalysisCrawler] Crawl complete. Pages:" + visited.size() + " Files stored:" + storedFiles.size());
        return storedFiles;
    }

    /**
     * Fetches a URL with a GET request and returns the body as text.
     * The body is decoded with the charset announced in the Content-Type header (UTF-8 when
     * absent or unknown); line terminators are normalised to {@code \n}.
     *
     * @param urlStr absolute http or https URL
     * @return the response body, or null on any error or non-200 status
     */
    protected String fetch(String urlStr)
    {
        HttpURLConnection conn = null;
        try
        {
            URL url = new URL(urlStr);
            URLConnection raw = url.openConnection();
            if (!(raw instanceof HttpURLConnection))
            {
                System.err.println("-- : [CityAnalysisCrawler] Fetch error: " + urlStr + " — unsupported protocol " + url.getProtocol());
                return null;
            }
            conn = (HttpURLConnection) raw;

            if (trustAllCertificates && conn instanceof HttpsURLConnection)
            {
                HttpsURLConnection httpsConn = (HttpsURLConnection) conn;
                httpsConn.setSSLSocketFactory(permissiveSocketFactory());
                httpsConn.setHostnameVerifier((h, s) -> true);
            }

            conn.setRequestMethod("GET");
            conn.setConnectTimeout(Math.max(0, timeoutMs));
            conn.setReadTimeout(Math.max(0, timeoutMs));
            conn.setRequestProperty("User-Agent", userAgent);
            conn.setInstanceFollowRedirects(true);

            int code = conn.getResponseCode();
            if (code != HttpURLConnection.HTTP_OK)
            {
                System.err.println("-- : [CityAnalysisCrawler] HTTP " + code + " from " + urlStr);
                return null;
            }

            java.nio.charset.Charset charset = charsetOf(conn.getContentType());
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset)))
            {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) sb.append(line).append("\n");
                return sb.toString();
            }
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Fetch error: " + urlStr + " — " + e.getMessage());
            return null;
        }
        finally
        {
            if (conn != null) conn.disconnect();
        }
    }

    /**
     * Writes content to the session directory under a name derived from the URL.
     * The URL-derived part is capped at 100 characters; when it had to be truncated, or a file
     * of that name already exists, a hash of the full URL is appended so that distinct URLs do
     * not overwrite each other.
     *
     * @return the written path, or null if writing failed
     */
    protected Path storeRaw(String url, String content, String ext)
    {
        try
        {
            String filename = url.replaceAll("https?://", "").replaceAll("[^a-zA-Z0-9.-]", "_");
            boolean truncated = filename.length() > MAX_FILENAME_LENGTH;
            if (truncated) filename = filename.substring(0, MAX_FILENAME_LENGTH);

            Path dir = Paths.get(sessionDir);
            Files.createDirectories(dir); // no-op when crawl() already created it

            Path outPath = dir.resolve(filename + "." + ext);
            if (truncated || Files.exists(outPath))
            {
                outPath = dir.resolve(filename + "_" + Integer.toHexString(url.hashCode()) + "." + ext);
            }

            Files.writeString(outPath, content);
            System.out.println("-- : [CityAnalysisCrawler] Stored: " + outPath + " (" + content.length() + " chars)");
            return outPath;
        }
        catch (Exception e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Store error: " + e.getMessage());
            return null;
        }
    }

    /**
     * Extracts href targets from markup and resolves them against the base URL.
     * Fragments are stripped, only http/https links on the base URL's host are kept, and each
     * resolved URL is returned once, in order of first appearance.
     *
     * @return resolved same-host links; empty if the base URL is malformed
     */
    protected List<String> extractLinks(String baseUrl, String content)
    {
        URL base;
        try
        {
            base = new URL(baseUrl);
        }
        catch (MalformedURLException e)
        {
            return new ArrayList<>();
        }
        String baseHost = base.getHost();

        Set<String> links = new LinkedHashSet<>();
        Matcher m = HREF_PATTERN.matcher(content);
        while (m.find())
        {
            // Attribute values are HTML-escaped; undo the one escape that matters in URLs.
            String href = m.group(1).trim().replace("&amp;", "&");
            try
            {
                URL resolved = new URL(base, href);
                String protocol = resolved.getProtocol();
                boolean fetchable = "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
                // Stay on same host (host names are case-insensitive)
                if (fetchable && resolved.getHost().equalsIgnoreCase(baseHost))
                {
                    links.add(resolved.toString());
                }
            }
            catch (MalformedURLException e)
            {
                /* skip malformed */
            }
        }
        return new ArrayList<>(links);
    }

    /**
     * Infers "csv", "xml" or "html" for a fetched resource.
     * The URL path suffix (ignoring query string, fragment and letter case) wins; otherwise
     * the content is sniffed: a leading double quote plus a comma means csv, a leading
     * {@code <?xml} means xml. Everything else is treated as html.
     */
    protected String inferExtension(String url, String content)
    {
        String path = url;
        int query = path.indexOf('?');
        if (query >= 0) path = path.substring(0, query);
        int fragment = path.indexOf('#');
        if (fragment >= 0) path = path.substring(0, fragment);
        path = path.toLowerCase(Locale.ROOT);

        if (path.endsWith(".csv")) return "csv";
        if (path.endsWith(".xml")) return "xml";

        String head = content;
        if (head.startsWith("\uFEFF")) head = head.substring(1); // drop a byte-order mark
        head = head.trim();

        if (head.startsWith("\"") && content.contains(",")) return "csv";
        if (head.startsWith("<?xml")) return "xml";
        return "html";
    }

    /** Returns the live list of files stored by this instance (not a copy). */
    public List<Path> getStoredFiles()
    {
        return storedFiles;
    }

    /**
     * Returns the trimmed text of the first descendant element with the given tag name,
     * or an empty string when no such element exists.
     */
    protected String getTag(Element parent, String tag)
    {
        NodeList nodes = parent.getElementsByTagName(tag);
        return nodes.getLength() > 0 ? nodes.item(0).getTextContent().trim() : "";
    }

    /** Returns value unless it is empty, in which case fallback is returned. */
    private static String textOr(String value, String fallback)
    {
        return value.isEmpty() ? fallback : value;
    }

    /** Parses an int option; empty or invalid input keeps the fallback and logs the problem. */
    private static int parseIntOr(String raw, int fallback, String tag)
    {
        if (raw.isEmpty()) return fallback;
        try
        {
            return Integer.parseInt(raw);
        }
        catch (NumberFormatException e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Invalid value for <" + tag + ">: '" + raw + "', keeping " + fallback);
            return fallback;
        }
    }

    /** Parses a long option; empty or invalid input keeps the fallback and logs the problem. */
    private static long parseLongOr(String raw, long fallback, String tag)
    {
        if (raw.isEmpty()) return fallback;
        try
        {
            return Long.parseLong(raw);
        }
        catch (NumberFormatException e)
        {
            System.err.println("-- : [CityAnalysisCrawler] Invalid value for <" + tag + ">: '" + raw + "', keeping " + fallback);
            return fallback;
        }
    }

    /** Picks the charset named in a Content-Type header, defaulting to UTF-8. */
    private static java.nio.charset.Charset charsetOf(String contentType)
    {
        if (contentType != null)
        {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find())
            {
                try
                {
                    return java.nio.charset.Charset.forName(m.group(1));
                }
                catch (IllegalArgumentException ignored)
                {
                    // Unknown or malformed charset name: fall back to UTF-8 below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Builds (once) the socket factory that accepts every certificate chain.
     * Only used when {@link #trustAllCertificates} is true.
     */
    private SSLSocketFactory permissiveSocketFactory() throws java.security.GeneralSecurityException
    {
        if (permissiveFactory == null)
        {
            TrustManager[] trustAll = new TrustManager[]{
                new X509TrustManager()
                {
                    @Override
                    public java.security.cert.X509Certificate[] getAcceptedIssuers()
                    {
                        // The interface contract requires a non-null (possibly empty) array.
                        return new java.security.cert.X509Certificate[0];
                    }

                    @Override
                    public void checkClientTrusted(java.security.cert.X509Certificate[] c, String a) {}

                    @Override
                    public void checkServerTrusted(java.security.cert.X509Certificate[] c, String a) {}
                }
            };
            SSLContext ctx = SSLContext.getInstance("TLS");
            ctx.init(null, trustAll, new java.security.SecureRandom());
            permissiveFactory = ctx.getSocketFactory();
        }
        return permissiveFactory;
    }
}
0 commit comments