/* dejan belic */
#include <stdio.h>
#include <stdlib.h>

/*
 * Applies k rounds of a clamped 3x3 mean filter to an n-by-m row-major matrix.
 *
 * Every element is replaced by the average of itself and its in-range
 * neighbours (the window is clipped at the matrix edges, so corner elements
 * average 4 values, edge elements 6, interior elements 9).
 *
 * Launch requirements:
 *   - exactly ONE block: only threadIdx is used for indexing and the rounds are
 *     separated by __syncthreads(), which is a block-level barrier;
 *   - blockDim.x >= n and blockDim.y >= m; thread (x, y) owns row x, column y.
 *     Surplus threads do no memory accesses but still reach every barrier.
 *
 * Parameters:
 *   k       - number of smoothing rounds; k <= 0 just copies mat to res_mat.
 *   n, m    - number of rows / columns.
 *   mat     - input matrix in device memory, n*m doubles, not modified.
 *   res_mat - output matrix in device memory, n*m doubles; also used as the
 *             in-place working buffer between rounds.
 */
__global__
void aa(int k, int n, int m, double *mat, double *res_mat) {
    const int row = threadIdx.x;
    const int col = threadIdx.y;

    // Threads outside the matrix must not touch memory, but they may not
    // return early either: every thread of the block has to hit each barrier.
    const bool active = (row < n) && (col < m);

    // Row-major layout: the row stride is the number of columns (m).
    const int index = row * m + col;

    if (active) {
        res_mat[index] = mat[index];
    }
    __syncthreads();

    // The clipped window never changes between rounds, so compute it once.
    const int x_min = (row == 0) ? 0 : row - 1;
    const int x_max = (row >= n - 1) ? n - 1 : row + 1;
    const int y_min = (col == 0) ? 0 : col - 1;
    const int y_max = (col >= m - 1) ? m - 1 : col + 1;

    // Signed counter: with an unsigned one a negative k would be converted to
    // a huge value and the loop would effectively never terminate.
    for (int t = 0; t < k; t++) {
        double sum = 0.0;
        int count = 0;

        if (active) {
            for (int i = x_min; i <= x_max; i++) {
                for (int j = y_min; j <= y_max; j++) {
                    sum += res_mat[i * m + j];
                    count++;
                }
            }
        }

        // All reads of the previous round's values must finish before any
        // thread overwrites its element ...
        __syncthreads();
        if (active) {
            res_mat[index] = sum / count;
        }
        // ... and all writes must land before the next round starts reading.
        __syncthreads();
    }
}



/* Reports a failed CUDA runtime call on stderr and terminates the process. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Reads from stdin: the dimensions n and m, then n*m matrix elements in
 * row-major order, then the round count k. Runs the `aa` kernel in a single
 * n-by-m thread block and prints the resulting matrix, one row per line.
 *
 * Returns 0 on success and EXIT_FAILURE on malformed input, on a matrix that
 * does not fit in one thread block, or on allocation failure. CUDA runtime
 * errors terminate the process via check_cuda().
 */
int main(int argc, char const *argv[]) {
    (void)argc;
    (void)argv;

    int n, m;
    if (scanf("%d%d", &n, &m) != 2 || n <= 0 || m <= 0) {
        fprintf(stderr, "expected two positive matrix dimensions\n");
        return EXIT_FAILURE;
    }

    // The kernel maps one thread to each element inside a single block, so the
    // matrix has to fit within the device's per-block limits. Checking here
    // avoids a silently failed launch followed by printing garbage.
    int device = 0;
    cudaDeviceProp prop;
    check_cuda(cudaGetDevice(&device), "cudaGetDevice");
    check_cuda(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties");
    if (n > prop.maxThreadsDim[0] || m > prop.maxThreadsDim[1] ||
        (long long)n * m > prop.maxThreadsPerBlock) {
        fprintf(stderr, "matrix %dx%d does not fit in one thread block (max %d threads)\n",
                n, m, prop.maxThreadsPerBlock);
        return EXIT_FAILURE;
    }

    const size_t count = (size_t)n * (size_t)m;
    const size_t size = count * sizeof(double);

    // Heap buffers instead of variable-length stack arrays.
    double *mat = (double *)malloc(size);
    double *res_mat = (double *)malloc(size);
    if (mat == NULL || res_mat == NULL) {
        fprintf(stderr, "out of host memory\n");
        free(mat);
        free(res_mat);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < count; i++) {
        if (scanf("%lf", &mat[i]) != 1) {
            fprintf(stderr, "failed to read matrix element %zu\n", i);
            free(mat);
            free(res_mat);
            return EXIT_FAILURE;
        }
    }

    int k;
    if (scanf("%d", &k) != 1 || k < 0) {
        fprintf(stderr, "expected a non-negative iteration count\n");
        free(mat);
        free(res_mat);
        return EXIT_FAILURE;
    }

    double *d_mat = NULL;
    double *d_res_mat = NULL;

    check_cuda(cudaMalloc((void **)&d_mat, size), "cudaMalloc(d_mat)");
    check_cuda(cudaMalloc((void **)&d_res_mat, size), "cudaMalloc(d_res_mat)");

    check_cuda(cudaMemcpy(d_mat, mat, size, cudaMemcpyHostToDevice), "copy to device");

    aa<<<1, dim3(n, m, 1)>>>(k, n, m, d_mat, d_res_mat);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the blocking cudaMemcpy below.
    check_cuda(cudaGetLastError(), "kernel launch");

    check_cuda(cudaMemcpy(res_mat, d_res_mat, size, cudaMemcpyDeviceToHost), "copy to host");

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++)
            printf("%.2lg\t", res_mat[(size_t)i * m + j]);
        printf("\n");
    }

    check_cuda(cudaFree(d_mat), "cudaFree(d_mat)");
    check_cuda(cudaFree(d_res_mat), "cudaFree(d_res_mat)");
    free(mat);
    free(res_mat);

    return 0;
}
