diff --git a/.gitmodules b/.gitmodules index 21e3ba7..775ce05 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "ethereum/crawler"] path = ethereum/crawler - url = https://github.com/LauraAntunes1/fast-ethereum-crawler + url = https://github.com/Blockchain-Technology-Lab/ethereum-network-decentralization.git diff --git a/ethereum/README.md b/ethereum/README.md index 7ce91d7..598592d 100644 --- a/ethereum/README.md +++ b/ethereum/README.md @@ -8,13 +8,13 @@ NOTE: this project uses another project as a submodule, so it needs to be cloned ```bash git clone --recurse-submodules git@github.com:Blockchain-Technology-Lab/network-decentralization.git ``` -Then, to download Nim dependencies and build the crawler, in the 'ethereum/crawler' folder, run: +Then, to download Nim dependencies and build the crawler, in the `ethereum/crawler` folder, run: ```bash make -j4 update make -j4 ``` Please note that it may take some time. -In the 'ethereum' folder, install Python dependencies - preferably in a Python virtual environment - using: +In the `ethereum` folder, install Python dependencies - preferably in a Python virtual environment - using: ```bash python3 -m pip install -r requirements.txt ``` @@ -27,18 +27,18 @@ chmod +x automation.sh ## How to run the tool -This component of the project analyses the decentralisation of the Ethereum network by exploring it, collecting information about participating nodes and visualising this information through different graphs. To run the tool, please see the 'Requirements and setup instructions' section, then use the following command: +This component of the project analyses the decentralisation of the Ethereum network by exploring it, collecting information about participating nodes and visualising this information through different graphs. The crawler also captures client-identification data and stores the latest results under `crawler/results/<run>/`. 
To run the tool, please see the 'Requirements and setup instructions' section, then use the following command: ```bash ./automation.sh ``` -Parameters can be modified in `config.yaml`. +The crawler workflow runs from the `ethereum` directory, launches `crawler/run.sh --guess --identify`, and reads the newest run from `crawler/results/<run>/`, so the automation script should be started from the `ethereum` folder. Parameters can be modified in `config.yaml`. --- ## Workflow Overview -1. **Network Crawling:** `dcrawl.nim` tries to discover all nodes participating in the network. This script comes from the [Fast Ethereum Crawler](https://github.com/cskiraly/fast-ethereum-crawler.git). -2. **Data Collection:** `collect_geodata.py` collects data about nodes like IP addresses and geolocation. +1. **Network Crawling:** `crawler/run.sh` launches `dcrawl.nim` with `--guess --identify` so discovery and client-identification data are captured together. The crawler itself comes from the [Fast Ethereum Crawler](https://github.com/cskiraly/fast-ethereum-crawler.git). +2. **Data Collection:** `collect_geodata.py` collects data about nodes like IP addresses and geolocation from the latest crawler run. 3. **Data Parsing:** `parse.py` formats raw logs into structured files. 4. **Visualisation:** `plot.py` generates several graphs. 5. **Metrics Computation:** `compute_metrics.py` computes decentralization metrics from parsed country/organization distributions. @@ -49,9 +49,12 @@ Parameters can be modified in `config.yaml`. ### Core Scripts -- **`dcrawl.nim`** +- **`crawler/dcrawl.nim`** Discovers nodes using bootnodes and recursive peer discovery via the Ethereum Discovery protocol. Communicates with peers and gathers peer info. +- **`crawler/run.sh`** + Wrapper for the crawler. The automation script calls it with `--guess --identify` and then processes the newest `crawler/results/<run>/` directory. 
+ - **`parse.py`** Processes raw data (e.g., logs from crawling) into structured formats (JSON, CSV) for easier analysis and plotting. @@ -84,7 +87,7 @@ Parameters can be modified in `config.yaml`. ## Output -The scripts generate: +The scripts generate their outputs in the newest `crawler/results/<run>/` directory and also produce: - Parsed node datasets (CSV, JSON) - Geolocation-enriched data - Plots and charts in PNG diff --git a/ethereum/automation.sh b/ethereum/automation.sh index 63541dd..4dc1159 100755 --- a/ethereum/automation.sh +++ b/ethereum/automation.sh @@ -4,26 +4,30 @@ source venv/bin/activate # venv is the Python virtual environment where all depe declare -i DAYS=7 -BOOTNODE="enr:-Ku4QHqVeJ8PPICcWk1vSn_XcSkjOkNiTg6Fmii5j6vUQgvzMc9L1goFnLKgXqBJspJjIsB91LTOleFmyWWrFVATGngBh2F0dG5ldHOIAAAAAAAAAACEZXRoMpC1MD8qAAAAAP__________gmlkgnY0gmlwhAMRHkWJc2VjcDI1NmsxoQKLVXFOhp2uX6jeT0DvvDpPcU8FWMjQdR4wMuORMhpX24N1ZHCCIyg" - -OUTPUTDIR="output" -[ ! -d "/path/to/dir" ] && mkdir -p "$OUTPUTDIR" +CRAWLER_DIR="crawler" while true do -build/dcrawl --bootnode="$BOOTNODE" "$@" # comment this line if new data must not be gathered -mv -t "$OUTPUTDIR" *.csv # the output is moved to the output directory +( cd "$CRAWLER_DIR" && ./run.sh --guess --identify "$@" ) # comment this line if new data must not be gathered + +latest_run="$(ls -td "$CRAWLER_DIR"/results/* 2>/dev/null | head -1 || true)" +if [ -z "$latest_run" ]; then + echo "Error: no crawler results directory was created." 
>&2 + exit 1 +fi +export OUTPUT_DIRECTORY="$latest_run" + python3 collect_geodata.py python3 parse.py python3 plot.py python3 compute_metrics.py -# The following 2 lines create a folder and move all png and csv files to it -mkdir "$OUTPUTDIR"/"$(date +%Y-%m-%d)" -mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip,discovery,peerstore}*.csv output/response_length.json output/*.png 2>/dev/null || true +# Push files to GitHub +#python3 push_to_github.py # script not on GitHub + echo "The tool will run again in "$DAYS" days." sleep "$DAYS"d # will repeat the whole process every DAYS days -done +done \ No newline at end of file diff --git a/ethereum/compute_metrics.py b/ethereum/compute_metrics.py index 946766c..b5823d8 100644 --- a/ethereum/compute_metrics.py +++ b/ethereum/compute_metrics.py @@ -160,7 +160,7 @@ def main(): network_metrics = hlp.get_metrics_network() geo_metrics = hlp.get_metrics_geo() - output_dir = pathlib.Path(__file__).parent / "output" + output_dir = hlp.get_output_directory() if not output_dir.exists(): print(f"Error: Output directory not found at {output_dir}", file=sys.stderr) sys.exit(1) diff --git a/ethereum/config.yaml b/ethereum/config.yaml index 914ccac..65977d8 100644 --- a/ethereum/config.yaml +++ b/ethereum/config.yaml @@ -10,9 +10,6 @@ mode: execution_parameters: concurrency: 100 -output_directories: - - ./output - # Metrics for network analysis (organizations) network_metrics: hhi: diff --git a/ethereum/crawler b/ethereum/crawler index 65b7eb9..8df52a0 160000 --- a/ethereum/crawler +++ b/ethereum/crawler @@ -1 +1 @@ -Subproject commit 65b7eb96222e17c44232a1da38e4d3cc1a9dc32f +Subproject commit 8df52a09fc6108bd8c5bcd63d3c7dc3e96a806a7 diff --git a/ethereum/helper.py b/ethereum/helper.py index 56ccd13..2d5cdad 100644 --- a/ethereum/helper.py +++ b/ethereum/helper.py @@ -1,4 +1,5 @@ from yaml import safe_load +import os import pathlib import requests import time @@ -38,17 +39,16 @@ def get_mode(): def 
get_output_directory(): """ - Reads the config file and retrieves the output directory - :returns: a directory that will contain the output files + Require the `OUTPUT_DIRECTORY` env var set by the caller + :returns: the path provided by `OUTPUT_DIRECTORY` """ - config = get_config_data() + try: + output_dir = pathlib.Path(os.environ['OUTPUT_DIRECTORY']).resolve() + except KeyError: + raise RuntimeError("OUTPUT_DIRECTORY environment variable is not set. Set it in automation.sh and retry.") - output_dir = [pathlib.Path(db_dir).resolve() for db_dir in config['output_directories']][0] if not output_dir.is_dir(): - output_dir.mkdir() - for subdir_type in ['osdata', 'geodata']: - subdir = output_dir / subdir_type - subdir.mkdir() + raise FileNotFoundError(f"OUTPUT_DIRECTORY does not exist: {output_dir}") return output_dir diff --git a/ethereum/parse.py b/ethereum/parse.py index a3b921f..55fe156 100644 --- a/ethereum/parse.py +++ b/ethereum/parse.py @@ -74,7 +74,8 @@ def analyse_distribution(nodes, layer, mode): else: # if the API used for the IP addresses doesn't return any value for the country or the organisation geodata_counter["Unknown"] = geodata_counter.get("Unknown", 0) + len(val) - filename = Path(f'./output/{mode.lower()}_{layer}.csv') + output_dir = hlp.get_output_directory() + filename = output_dir / f'{mode.lower()}_{layer}.csv' if filename.is_file(): df = pd.read_csv(filename) @@ -88,10 +89,10 @@ def analyse_distribution(nodes, layer, mode): df.loc[rows] = [geodata] + [0]*(columns-1) geodata_in_order.append(geodata_counter[geodata]) df[datetime.today().strftime('%Y-%m-%d')] = geodata_in_order - df.to_csv(f'./output/{mode.lower()}_{layer}.csv', index = False) + df.to_csv(filename, index = False) else: geodata_df = pd.DataFrame.from_dict(geodata_counter, orient='index', columns=[datetime.today().strftime('%Y-%m-%d')]) - geodata_df.to_csv(f'./output/{mode.lower()}_{layer}.csv', index_label = mode) + geodata_df.to_csv(filename, index_label = mode) def 
cluster_organizations(layer): """