diff --git a/docs/src/solutions_for_all_notebooks.md b/docs/src/solutions_for_all_notebooks.md
index 292903c..80140b3 100644
--- a/docs/src/solutions_for_all_notebooks.md
+++ b/docs/src/solutions_for_all_notebooks.md
@@ -160,4 +160,57 @@ end
 msg = 2
 @fetchfrom 2 work(msg)
 ```
+## Matrix-matrix multiplication
+
+### Exercise 1
+
+```julia
+# Compute C = A*B in parallel: worker iw gets rows lb:ub of A and all of B, returns rows lb:ub of C.
+function matmul_dist_3!(C,A,B)
+    m, n = size(C,1), size(C,2)
+    l = size(A,2)
+    @assert size(A,1) == m
+    @assert size(B,2) == n
+    @assert size(B,1) == l
+    @assert mod(m,nworkers()) == 0  # the rows of C must split evenly among the workers
+    nrows_w = div(m,nworkers())  # number of rows handled by each worker
+    @sync for (iw,w) in enumerate(workers())
+        lb = 1 + (iw-1)*nrows_w
+        ub = iw*nrows_w
+        A_w = A[lb:ub,:]  # copy only the needed rows, so the closure below does not capture all of A
+        ftr = @spawnat w begin
+             C_w = similar(A_w, size(A_w,1), size(B,2))  # block of C is nrows_w-by-n, not the shape of A_w
+             matmul_seq!(C_w,A_w,B)
+             C_w
+        end
+        @async C[lb:ub,:] = fetch(ftr)  # fetch asynchronously so all workers run concurrently
+    end
+    C
+end
+
+# Sequential kernel: fill C with the product A*B using the textbook triple loop.
+# Wrapped in @everywhere so the definition is evaluated on every process.
+@everywhere function matmul_seq!(C,A,B)
+    m, n = size(C,1), size(C,2)
+    l = size(A,2)
+    # For standard 1-based arrays these checks cover every A/B index used under @inbounds below.
+    @assert size(A,1) == m
+    @assert size(B,2) == n
+    @assert size(B,1) == l
+    acc0 = zero(eltype(C))  # additive identity in the element type of C
+    for j in 1:n, i in 1:m
+        acc = acc0
+        for k in 1:l
+            @inbounds acc = acc + A[i,k]*B[k,j]
+        end
+        C[i,j] = acc  # one write per entry of C
+    end
+    C
+end
+```
+
+### Exercise 2
+
+At each call to `@spawnat` we communicate O(N) data and perform O(N) operations in a worker process, just like in algorithm 1. However, each worker does this N^2/P times on average. Thus, the total communication and the total computation on a worker are both O(N^3/P). The communication over computation ratio is therefore still O(1), so communication will dominate in practice, making the algorithm inefficient.
+
 
diff --git a/notebooks/matrix_matrix.ipynb b/notebooks/matrix_matrix.ipynb
index 7968e87..1cc299c 100644
--- a/notebooks/matrix_matrix.ipynb
+++ b/notebooks/matrix_matrix.ipynb
@@ -72,9 +72,10 @@
     "        \"It's not correct. Keep trying! 💪\"\n",
     "    end |> println\n",
     "end\n",
+    "alg_0_comp_check(answer) = answer_checker(answer, \"d\")\n",
     "alg_1_deps_check(answer) = answer_checker(answer,\"b\")\n",
-    "alg_1_comm_overhead_check(answer) = answer_checker(answer, \"c\")\n",
-    "alg_1_comp_check(answer) = answer_checker(answer, \"a\")\n",
+    "alg_1_comm_overhead_check(answer) = answer_checker(answer, \"b\")\n",
+    "alg_1_comp_check(answer) = answer_checker(answer, \"b\")\n",
     "alg_2_complex_check(answer) = answer_checker(answer, \"b\")\n",
     "alg_2_deps_check(answer) = answer_checker(answer,\"d\")\n",
     "alg_3_deps_check(answer) = answer_checker(answer, \"c\")\n",
@@ -88,7 +89,7 @@
    "source": [
     "## Problem Statement\n",
     "\n",
-    "Let us consider the (dense) matrix-matrix product `C=A*B`."
+    "Given two $N$-by-$N$ matrices $A$ and $B$, compute the matrix-matrix product $C=AB$. Compute it in parallel and efficiently."
    ]
   },
   {
@@ -157,7 +158,7 @@
    "source": [
     "## Serial implementation\n",
     "\n",
-    "We start by considering the (naive) sequential algorithm:"
+    "We start by considering the (naive) sequential algorithm, which is based on the mathematical definition of the matrix-matrix product, $C_{ij} = \\sum_k A_{ik} B_{kj}$:"
    ]
   },
   {
@@ -188,6 +189,30 @@
     "end"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e3b86457",
+   "metadata": {},
+   "source": [
+    "Run the next cell to test the implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5caf799",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "using Test\n",
+    "N = 10\n",
+    "A = rand(N,N)\n",
+    "B = rand(N,N)\n",
+    "C = similar(A)\n",
+    "matmul_seq!(C,A,B)\n",
+    "@test C ≈ A*B"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "f967d2ea",
@@ -216,6 +241,32 @@
     "@btime mul!(C,A,B);"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0ca2fbd4",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-success\">\n",
+    "<b>Question:</b>  What is the complexity (number of operations) of the serial algorithm? Assume that all matrices are $N$-by-$N$ matrices.    \n",
+    "</div>\n",
+    "\n",
+    "    a) O(1)\n",
+    "    b) O(N)\n",
+    "    c) O(N²)\n",
+    "    d) O(N³)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "078e974e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "answer = \"x\" # replace x with a, b, c, or d \n",
+    "alg_0_comp_check(answer)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0eedd28a",
@@ -489,10 +540,10 @@
     "<b>Question:</b>  How many scalars are communicated from and to a worker?  Assume that matrices A, B, and C are N by N matrices.\n",
     "</div>\n",
     "\n",
-    "    a) 3N\n",
-    "    b) 2N + 2\n",
-    "    c) 2N + 1\n",
-    "    d) N² + 1"
+    "    a) O(1)\n",
+    "    b) O(N)\n",
+    "    c) O(N²)\n",
+    "    d) O(N³)"
    ]
   },
   {
@@ -515,9 +566,10 @@
     "<b>Question:</b>  How many operations are done in a worker?    \n",
     "</div>\n",
     "\n",
-    "    a) O(N)\n",
-    "    b) O(N²)\n",
-    "    c) O(N³)"
+    "    a) O(1)\n",
+    "    b) O(N)\n",
+    "    c) O(N²)\n",
+    "    d) O(N³)"
    ]
   },
   {
@@ -905,9 +957,9 @@
     "\n",
     "| Algorithm | Parallelism <br>(#workers) | Communication <br>per worker | Computation <br>per worker | Ratio communication/<br>computation |\n",
     "|---|---|---|---|---|\n",
-    "| 1 | N² | 2N + 1 | N | O(1) |\n",
-    "| 2 | N | 2N + N² | N² | O(1) |\n",
-    "| 3 | P | N² + 2N²/P | N³/P | O(P/N) |\n",
+    "| 1 | N² | O(N) | O(N) | O(1) |\n",
+    "| 2 | N | O(N²) | O(N²) | O(1) |\n",
+    "| 3 | P | O(N²) | O(N³/P) | O(P/N) |\n",
     "\n",
     "\n",
     "- Matrix-matrix multiplication is trivially parallelizable (all entries in the result matrix can be computed in parallel, at least in theory)\n",