Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,18 +173,20 @@ After having a successful build and a complete `cluster.yaml`, we are ready for
### 1. Run Internal Examples

#### 1.1. Run a Single Example
To run an internal example program (e.g., `examples/all_reduce.cc`), just run:
Example programs live under `examples/`, grouped into subdirectories by paradigm (e.g., `examples/mpi/`).

To run an internal example program (e.g., `examples/mpi/all_reduce.cc`), just run:

```bash
# Build and run the executable across the cluster based on the config specified in your `cluster.yaml`
icclrun --config [path-to-your-cluster.yaml] --build all_reduce [program args]
icclrun --config [path-to-your-cluster.yaml] --build mpi/all_reduce [program args]
```

- `--config / -c` : Path to the cluster YAML file.
- `--build` : Instructs `icclrun` to compile the library on each node before execution. If omitted, `icclrun` assumes the library is already installed at a consistent location.
- `--launcher` : Specify the backend launcher to be used. Defaults to `ompi`.

The executable (e.g., `all_reduce`) and its arguments follow the options.
The executable (e.g., `mpi/all_reduce`) and its arguments follow the options.

For more details, run:

Expand All @@ -207,7 +209,7 @@ chmod +x ./scripts/run_examples.sh
For all the options of the script, see:

```bash
./run_examples --help
./scripts/run_examples.py --help
```

### 2. Run a Custom User Program
Expand Down
42 changes: 28 additions & 14 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,47 +1,61 @@
file(GLOB_RECURSE EXAMPLE_SOURCES "*.cc")

foreach(source_file ${EXAMPLE_SOURCES})
get_filename_component(example_name ${source_file} NAME_WE)
file(RELATIVE_PATH rel_file "${CMAKE_CURRENT_SOURCE_DIR}" "${source_file}")

add_executable(${example_name} ${source_file})
get_filename_component(file_dir ${rel_file} DIRECTORY)
get_filename_component(file_we ${rel_file} NAME_WE)

target_link_libraries(${example_name} PRIVATE infiniccl)
if(file_dir)
string(REPLACE "/" "_" target_name "${file_dir}_${file_we}")
else()
set(target_name ${file_we})
endif()

add_executable(${target_name} ${source_file})

set_target_properties(${target_name} PROPERTIES
OUTPUT_NAME "${file_we}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${file_dir}"
)

target_link_libraries(${target_name} PRIVATE infiniccl)

# Add runtime and backend dependencies for direct runtime/backend usage.
if(WITH_NVIDIA)
target_link_libraries(${example_name} PRIVATE CUDA::cudart)
target_link_libraries(${target_name} PRIVATE CUDA::cudart)
endif()

if(WITH_ILUVATAR)
set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX)
set_target_properties(${example_name} PROPERTIES
set_target_properties(${target_name} PROPERTIES
RULE_LAUNCH_COMPILE "${ILUVATAR_CUDA_COMPILER} "
)
target_compile_options(${example_name} PRIVATE ${ILUVATAR_CUDA_FLAGS} "-Wno-unused-command-line-argument")
target_link_libraries(${example_name} PRIVATE CUDA::cudart CUDA::cuda_driver)
target_compile_options(${target_name} PRIVATE ${ILUVATAR_CUDA_FLAGS} "-Wno-unused-command-line-argument")
target_link_libraries(${target_name} PRIVATE CUDA::cudart CUDA::cuda_driver)
endif()

if(WITH_METAX)
target_link_libraries(${example_name} PRIVATE ${MACA_RUNTIME_LIB})
target_compile_options(${example_name} PRIVATE "-x" "maca")
target_link_libraries(${target_name} PRIVATE ${MACA_RUNTIME_LIB})
target_compile_options(${target_name} PRIVATE "-x" "maca")
endif()

if(WITH_MOORE)
target_link_libraries(${example_name} PRIVATE ${MUSART_LIB})
target_compile_options(${example_name} PRIVATE "-x" "musa")
target_link_libraries(${target_name} PRIVATE ${MUSART_LIB})
target_compile_options(${target_name} PRIVATE "-x" "musa")
endif()

if(WITH_CAMBRICON)
target_link_libraries(${example_name} PRIVATE ${CAMBRICON_RUNTIME_LIB})
target_link_libraries(${target_name} PRIVATE ${CAMBRICON_RUNTIME_LIB})
endif()

if(WITH_OMPI OR WITH_MPICH)
target_link_libraries(${example_name} PRIVATE MPI::MPI_CXX)
target_link_libraries(${target_name} PRIVATE MPI::MPI_CXX)
endif()

# Explicitly allow examples to "peek" into the internal `src` and binary dirs.
# This is necessary because these were marked `PRIVATE` in the library's CMake.
target_include_directories(${example_name} PRIVATE
target_include_directories(${target_name} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
"${PROJECT_SOURCE_DIR}/src" # For internal templates like `runtime.h`.
"${CMAKE_BINARY_DIR}/src" # For the generated `backend_manifest.h`.
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
129 changes: 101 additions & 28 deletions scripts/run_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,75 @@
import subprocess
import sys
from datetime import datetime
from typing import Optional
from pathlib import Path
from typing import List, Optional


# ==============================================================================
# DISCOVERY UTILITIES
# ==============================================================================
def discover_available_examples(examples_root: Path) -> List[dict]:
    """
    Recursively scan `examples_root` for `*.cc` source files.

    Args:
        examples_root: Root directory of the examples tree.

    Returns:
        One dict per discovered source file, ordered by path, with the keys:
        - "absolute_path": the `Path` of the source file as yielded by `rglob`.
        - "relative_path": path relative to `examples_root`, with `/` separators
          and the file extension included.
        - "category": parent directory relative to `examples_root`, with `/`
          separators; the empty string for files directly under the root.
        - "name_we": file name without its final extension.
        - "binary_path": "<category>/<name_we>", or just "<name_we>" when the
          category is empty.
    """
    found = []
    # `rglob` yields entries in filesystem order, which is not stable across
    # machines; sort so that run order and logs are reproducible.
    for path in sorted(examples_root.rglob("*.cc")):
        # A directory whose name ends in `.cc` also matches the pattern but is
        # not a source file.
        if not path.is_file():
            continue

        rel_path = path.relative_to(examples_root)
        file_we = rel_path.stem

        # `Path("x.cc").parent` is ".", which denotes "no category".
        parent_posix = rel_path.parent.as_posix()
        category = "" if parent_posix == "." else parent_posix

        binary_rel_path = f"{category}/{file_we}" if category else file_we

        found.append(
            {
                "absolute_path": path,
                "relative_path": rel_path.as_posix(),
                "category": category,
                "name_we": file_we,
                "binary_path": binary_rel_path,
            }
        )
    return found


def resolve_targets(input_strings: List[str], available: List[dict]) -> List[dict]:
    """
    Resolve user-provided query strings into discovered example targets.

    A query selects an entry of `available` when it equals any of:
    1. A category directory (e.g. 'mpi' or 'mpi/') -> selects everything
       inside it, including nested sub-categories such as 'mpi/p2p'.
    2. An exact relative path, with or without the extension
       (e.g. 'mpi/all_reduce.cc' or 'mpi/all_reduce').
    3. A short program name (e.g. 'all_reduce') -> selects every example
       with that name, regardless of category.

    Args:
        input_strings: Raw queries; surrounding whitespace is stripped,
            `os.sep` is normalized to '/', and empty queries are ignored.
        available: Entries carrying the keys "category", "relative_path",
            "binary_path", and "name_we".

    Returns:
        The matching entries in first-match order, de-duplicated by
        "binary_path". A warning is printed for each query that matches
        nothing.
    """
    resolved = []
    seen_binaries = set()

    for raw_query in input_strings:
        shown_query = raw_query.strip().replace(os.sep, "/")
        # Accept 'mpi/' as a spelling of the category 'mpi'.
        query = shown_query.rstrip("/")
        if not query:
            continue

        # Categories are stored as full relative directories ('mpi/p2p'), so a
        # parent directory must be matched as a prefix on a '/' boundary;
        # plain equality would skip nested examples.
        nested_prefix = query + "/"

        matched_any = False
        for item in available:
            category = item["category"]
            is_match = (
                query == category
                or category.startswith(nested_prefix)
                or query == item["relative_path"]
                or query == item["binary_path"]
                or query == item["name_we"]
            )
            if not is_match:
                continue

            # Count the match even when the target was already selected by an
            # earlier query, so that overlapping queries do not warn.
            matched_any = True
            if item["binary_path"] not in seen_binaries:
                seen_binaries.add(item["binary_path"])
                resolved.append(item)

        if not matched_any:
            print(
                f"⚠️ Warning: Input pattern '{shown_query}' did not match any discovered examples."
            )

    return resolved


# ==============================================================================
Expand All @@ -19,7 +87,7 @@ def setup_log_directory(directory: str) -> str:


def run_iccl_example(
example_name: str,
target_info: dict,
config_path: str,
launcher_opt: Optional[str],
log_dir: str,
Expand All @@ -28,41 +96,39 @@ def run_iccl_example(
timeout_duration: int,
) -> bool:
"""Executes an example via `icclrun` orchestration framework."""
log_file_path = os.path.join(log_dir, f"{example_name}.log")
binary_path = target_info["binary_path"]
safe_log_name = binary_path.replace("/", "_")
log_file_path = os.path.join(log_dir, f"{safe_log_name}.log")

status_msg = (
"🚀 Running via `icclrun` (with build):"
if trigger_build
else "🚀 Running via `icclrun`: "
)
print(f"{status_msg:<35} {example_name:<20}", end="", flush=True)
print(f"{status_msg:<35} {binary_path:<30}", end="", flush=True)

# Base Command Assembly
cmd = ["icclrun", "--config", config_path]
if launcher_opt and launcher_opt.strip():
cmd.extend(["--launcher", launcher_opt.strip()])

if trigger_build:
cmd.extend(["--build", example_name])
cmd.extend(["--build", binary_path])
else:
cmd.append(example_name)

# Format the exact command as a clean string for log documentation.
exact_command_str = " ".join(cmd)
cmd.append(binary_path)

# Force environment unbuffered stream states for sub-python instances.
custom_env = os.environ.copy()
custom_env["PYTHONUNBUFFERED"] = "1"

try:
with open(log_file_path, "w", buffering=1) as log_file:
# Write the exact underlying command as the absolute first line of the log.
log_file.write(f"[COMMAND]: {exact_command_str}\n")
log_file.write(f"[COMMAND]: {' '.join(cmd)}\n")
log_file.write("=" * 80 + "\n\n")
log_file.flush()

if verbose:
print(f"\n--- [VERBOSE OUTPUT START: `{example_name}`] ---")
print(f"\n--- [VERBOSE OUTPUT START: `{binary_path}`] ---")

# Execute with `Popen` to stream `stdout`/`stderr` live to both terminal and file.
process = subprocess.Popen(
Expand All @@ -80,7 +146,7 @@ def run_iccl_example(
process.wait(timeout=timeout_duration)
return_code = process.returncode
print(
f"--- [VERBOSE OUTPUT END: `{example_name}`] ---\n" + " " * 56,
f"--- [VERBOSE OUTPUT END: `{binary_path}`] ---\n" + " " * 66,
end="",
)
else:
Expand All @@ -106,7 +172,7 @@ def run_iccl_example(
print(f" ❌ TIMEOUT (Exceeded {timeout_duration} seconds)")
with open(log_file_path, "a") as f:
f.write(
f"\n[RUNNER ERROR]: Distributed `icclrun` harness timed out after {timeout_duration} seconds.\n"
f"\n[RUNNER ERROR]: Harness timed out after {timeout_duration} seconds.\n"
)
return False
except FileNotFoundError:
Expand Down Expand Up @@ -142,8 +208,8 @@ def main():
"-e",
"--examples",
type=str,
default="all_reduce",
help="Comma-separated list of example target names to execute.",
default="mpi",
help="Comma-separated paths, categories, or short names (fuzzy match). E.g. 'mpi', 'mpi/all_reduce', or 'all_reduce'.",
)
parser.add_argument(
"-o",
Expand All @@ -168,17 +234,27 @@ def main():

args = parser.parse_args()

# Parse and clean the comma-separated examples list.
examples_to_run = [ex.strip() for ex in args.examples.split(",") if ex.strip()]
script_dir = Path(__file__).parent.resolve()
examples_root = script_dir / "../examples"
if not examples_root.exists():
examples_root = Path("examples").resolve()

if not examples_root.exists():
print("❌ Error: Could not locate 'examples/' directory tree.")
sys.exit(1)

all_available = discover_available_examples(examples_root)

input_queries = [ex.strip() for ex in args.examples.split(",") if ex.strip()]
targets_to_run = resolve_targets(input_queries, all_available)

# Sanity Checks
if not os.path.exists(args.config):
print(f"❌ Error: Config file matching '{args.config}' could not be located.")
sys.exit(1)

if not examples_to_run:
if not targets_to_run:
print(
"❌ Error: No target programs defined. Please check your `--examples` list."
"❌ Error: No valid targets resolved. Please check your `--examples` query configurations."
)
sys.exit(1)

Expand All @@ -199,9 +275,9 @@ def main():
failed_programs = []
is_first_run = True

for example in examples_to_run:
for target in targets_to_run:
success = run_iccl_example(
example_name=example,
target_info=target,
config_path=args.config,
launcher_opt=args.launcher,
log_dir=current_run_log_dir,
Expand All @@ -216,12 +292,9 @@ def main():
passed_count += 1
else:
failed_count += 1
failed_programs.append(example)
failed_programs.append(target["binary_path"])

# ==============================================================================
# METRICS REPORTING
# ==============================================================================
total_programs = len(examples_to_run)
total_programs = len(targets_to_run)
success_rate = (passed_count / total_programs) * 100

print("\n==================================================================")
Expand Down
Loading