Blockchain-Technology-Lab · LauraAntunes1 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jul 1, 2026
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "ethereum/crawler"]
 	path = ethereum/crawler
-	url = https://github.com/LauraAntunes1/fast-ethereum-crawler
+	url = https://github.com/Blockchain-Technology-Lab/ethereum-network-decentralization.git
diff --git a/ethereum/README.md b/ethereum/README.md
@@ -8,13 +8,13 @@ NOTE: this project uses another project as a submodule, so it needs to be cloned
 ```bash
 git clone --recurse-submodules git@github.com:Blockchain-Technology-Lab/network-decentralization.git
 ```
-Then, to download Nim dependencies and build the crawler, in the 'ethereum/crawler' folder, run:
+Then, to download Nim dependencies and build the crawler, in the `ethereum/crawler` folder, run:
 ```bash
 make -j4 update
 make -j4
 ```
 Please note that it may take some time.  
-In the 'ethereum' folder, install Python dependencies - preferably in a Python virtual environment - using:
+In the `ethereum` folder, install Python dependencies - preferably in a Python virtual environment - using:
 ```bash
 python3 -m pip install -r requirements.txt
 ```
@@ -27,18 +27,18 @@ chmod +x automation.sh
 
 ## How to run the tool
 
-This component of the project analyses the decentralisation of the Ethereum network by exploring it, collecting information about participating nodes and visualising this information through different graphs. To run the tool, please see the 'Requirements and setup instructions' section, then use the following command:
+This component of the project analyses the decentralisation of the Ethereum network by exploring it, collecting information about participating nodes and visualising this information through different graphs. The crawler also captures client-identification data and stores the latest results under `crawler/results/<timestamp>/`. To run the tool, please see the 'Requirements and setup instructions' section, then use the following command:
 ```bash
 ./automation.sh
 ```
-Parameters can be modified in `config.yaml`.
+The automation script must be started from the `ethereum` folder: it launches `crawler/run.sh --guess --identify` and then reads the newest run from `crawler/results/<timestamp>/`, and both paths are relative to that folder. Parameters can be modified in `config.yaml`.
 
 ---
 
 ## Workflow Overview
 
-1. **Network Crawling:** `dcrawl.nim` tries to discover all nodes participating in the network. This script comes from the [Fast Ethereum Crawler](https://github.com/cskiraly/fast-ethereum-crawler.git).
-2. **Data Collection:** `collect_geodata.py` collects data about nodes like IP addresses and geolocation.
+1. **Network Crawling:** `crawler/run.sh` launches `dcrawl.nim` with `--guess --identify` so discovery and client-identification data are captured together. The crawler itself comes from the [Fast Ethereum Crawler](https://github.com/cskiraly/fast-ethereum-crawler.git).
+2. **Data Collection:** `collect_geodata.py` collects node data, such as IP addresses and geolocation, from the latest crawler run.
 3. **Data Parsing:** `parse.py` formats raw logs into structured files.
 4. **Visualisation:** `plot.py` generates several graphs.
 5. **Metrics Computation:** `compute_metrics.py` computes decentralization metrics from parsed country/organization distributions.
@@ -49,9 +49,12 @@ Parameters can be modified in `config.yaml`.
 
 ### Core Scripts
 
-- **`dcrawl.nim`**  
+- **`crawler/dcrawl.nim`**  
   Discovers nodes using bootnodes and recursive peer discovery via the Ethereum Discovery protocol. Communicates with peers and gathers peer info.
 
+- **`crawler/run.sh`**  
+  Wrapper for the crawler. The automation script calls it with `--guess --identify` and then processes the newest `crawler/results/<timestamp>/` directory.
+
 - **`parse.py`**  
   Processes raw data (e.g., logs from crawling) into structured formats (JSON, CSV) for easier analysis and plotting.
 
@@ -84,7 +87,7 @@ Parameters can be modified in `config.yaml`.
 
 ## Output
 
-The scripts generate:
+The scripts write their outputs to the newest `crawler/results/<timestamp>/` directory. These include:
 - Parsed node datasets (CSV, JSON)
 - Geolocation-enriched data
 - Plots and charts in PNG

diff --git a/ethereum/automation.sh b/ethereum/automation.sh
@@ -4,26 +4,30 @@ source venv/bin/activate # venv is the Python virtual environment where all depe
 
 declare -i DAYS=7
 
-BOOTNODE="enr:-Ku4QHqVeJ8PPICcWk1vSn_XcSkjOkNiTg6Fmii5j6vUQgvzMc9L1goFnLKgXqBJspJjIsB91LTOleFmyWWrFVATGngBh2F0dG5ldHOIAAAAAAAAAACEZXRoMpC1MD8qAAAAAP__________gmlkgnY0gmlwhAMRHkWJc2VjcDI1NmsxoQKLVXFOhp2uX6jeT0DvvDpPcU8FWMjQdR4wMuORMhpX24N1ZHCCIyg"
-
-OUTPUTDIR="output"
-[ ! -d "/path/to/dir" ] && mkdir -p "$OUTPUTDIR"
+CRAWLER_DIR="crawler"
 
 while true
 do
 
-build/dcrawl --bootnode="$BOOTNODE" "$@" # comment this line if new data must not be gathered
-mv -t "$OUTPUTDIR" *.csv # the output is moved to the output directory
+( cd "$CRAWLER_DIR" && ./run.sh --guess --identify "$@" ) # comment this line if new data must not be gathered
+
+latest_run="$(ls -td "$CRAWLER_DIR"/results/* 2>/dev/null | head -1 || true)"
+if [ -z "$latest_run" ]; then
+	echo "Error: no crawler results directory was created." >&2
+	exit 1
+fi
+export OUTPUT_DIRECTORY="$latest_run"
+
 python3 collect_geodata.py
 python3 parse.py
 python3 plot.py
 python3 compute_metrics.py
 
-# The following 2 lines create a folder and move all png and csv files to it
-mkdir "$OUTPUTDIR"/"$(date +%Y-%m-%d)"
-mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip,discovery,peerstore}*.csv output/response_length.json output/*.png 2>/dev/null || true
+# Push files to GitHub
+#python3 push_to_github.py # script not on GitHub
+
 echo "The tool will run again in "$DAYS" days."
 
 sleep "$DAYS"d # will repeat the whole process every DAYS days
 
-done
+done
diff --git a/ethereum/compute_metrics.py b/ethereum/compute_metrics.py
@@ -160,7 +160,7 @@ def main():
     network_metrics = hlp.get_metrics_network()
     geo_metrics = hlp.get_metrics_geo()
 
-    output_dir = pathlib.Path(__file__).parent / "output"
+    output_dir = hlp.get_output_directory()
     if not output_dir.exists():
         print(f"Error: Output directory not found at {output_dir}", file=sys.stderr)
         sys.exit(1)

diff --git a/ethereum/config.yaml b/ethereum/config.yaml
@@ -10,9 +10,6 @@ mode:
 execution_parameters:
   concurrency: 100
 
-output_directories:  
-  - ./output
-
 # Metrics for network analysis (organizations)
 network_metrics:
   hhi:

diff --git a/ethereum/crawler b/ethereum/crawler
diff --git a/ethereum/helper.py b/ethereum/helper.py
@@ -1,4 +1,5 @@
 from yaml import safe_load
+import os
 import pathlib
 import requests
 import time
@@ -38,17 +39,16 @@ def get_mode():
 
 def get_output_directory():
     """
-    Reads the config file and retrieves the output directory
-    :returns: a directory that will contain the output files
+    Require the `OUTPUT_DIRECTORY` env var set by the caller
+    :returns: the path provided by `OUTPUT_DIRECTORY`
     """
-    config = get_config_data()
+    try:
+        output_dir = pathlib.Path(os.environ['OUTPUT_DIRECTORY']).resolve()
+    except KeyError:
+        raise RuntimeError("OUTPUT_DIRECTORY environment variable is not set. Set it in automation.sh and retry.")
 
-    output_dir = [pathlib.Path(db_dir).resolve() for db_dir in config['output_directories']][0]
     if not output_dir.is_dir():
-        output_dir.mkdir()
-        for subdir_type in ['osdata', 'geodata']:
-            subdir = output_dir / subdir_type
-            subdir.mkdir()
+        raise FileNotFoundError(f"OUTPUT_DIRECTORY does not exist: {output_dir}")
 
     return output_dir
 

diff --git a/ethereum/parse.py b/ethereum/parse.py
@@ -74,7 +74,8 @@ def analyse_distribution(nodes, layer, mode):
         else: # if the API used for the IP addresses doesn't return any value for the country or the organisation
             geodata_counter["Unknown"] = geodata_counter.get("Unknown", 0) + len(val)
 
-    filename = Path(f'./output/{mode.lower()}_{layer}.csv')
+    output_dir = hlp.get_output_directory()
+    filename = output_dir / f'{mode.lower()}_{layer}.csv'
 
     if filename.is_file():
         df = pd.read_csv(filename)
@@ -88,10 +89,10 @@ def analyse_distribution(nodes, layer, mode):
                 df.loc[rows] = [geodata] + [0]*(columns-1)
                 geodata_in_order.append(geodata_counter[geodata])
         df[datetime.today().strftime('%Y-%m-%d')] = geodata_in_order
-        df.to_csv(f'./output/{mode.lower()}_{layer}.csv', index = False)
+        df.to_csv(filename, index = False)
     else:
         geodata_df = pd.DataFrame.from_dict(geodata_counter, orient='index', columns=[datetime.today().strftime('%Y-%m-%d')])
-        geodata_df.to_csv(f'./output/{mode.lower()}_{layer}.csv', index_label = mode)
+        geodata_df.to_csv(filename, index_label = mode)
 
 def cluster_organizations(layer):
     """
+12 −0		.gitmodules
+51 −0		CLAUDE.md
+2 −2		LICENSE-APACHEv2
+2 −2		LICENSE-MIT
+11 −0		Makefile
+90 −4		README.md
+46 −0		TODO.md
+242 −17		dcrawl.nim
+12 −1		dcrawl.nim.cfg
+104 −0		devp2p_probe.nim
+393 −0		identify.nim
+289 −0		postprocessing/fingerprint.py
+529 −61		postprocessing/plot.py
+126 −41		postprocessing/progress.py
+1 −0		vendor/dnsclient.nim
+1 −1		vendor/nim-eth
+1 −0		vendor/nim-libp2p
+1 −0		vendor/nim-unittest2
+1 −0		vendor/nim-websock